diff mbox

Various fixes for <codecvt> facets

Message ID 20170313193547.GW3501@redhat.com
State New
Headers show

Commit Message

Jonathan Wakely March 13, 2017, 7:36 p.m. UTC
This is a series of patches to fix various bugs in the Unicode
character conversion facets.

The first patch fixes a silly < versus <= bug that meant that 0xffff
got written as a surrogate pair instead of as simply 0xffff, and an
endianness bug for the internal representation of UTF-16 code units
stored in char32_t or wchar_t values. That's PR 79511.

The second patch fixes some incorrect bitwise operations (because I
confused & and |) and some incorrect limits (because I confused max
and min). That fixes determining the endianness of the external
representation bytes when they start with a Byte Order Mark, and
correctly reports errors on invalid UCS2. It also fixes
wstring_convert so that it reports the number of characters that were
converted prior to an error. That's PR 79980.

The third patch fixes the output of the encoding() and max_length()
member functions on the codecvt facets, because I wasn't correctly
accounting for a BOM or for the differences between UTF-16 and UCS2.

I plan to commit these for all branches, but I'll wait until after GCC
7.1 is released, and fix it for 7.2 instead. These bugs aren't
important enough to rush into trunk now.
commit c5bbc9258a7182e14eb731e5251842bc417b5822
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Fri Mar 10 20:12:09 2017 +0000

    PR libstdc++/79511 fix endianness of UTF-16 data
    
    	PR libstdc++/79511
    	* src/c++11/codecvt.cc (write_utf16_code_point): Don't write 0xffff
    	as a surrogate pair.
    	(__codecvt_utf8_utf16_base<char32_t>::do_in): Use native endianness
    	for internal representation.
    	(__codecvt_utf8_utf16_base<wchar_t>::do_in): Likewise.
    	* testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc: New test.
commit dc9f4c953aa1600978877a90763122f0104e6c4c
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Sat Mar 11 03:39:30 2017 +0000

    PR libstdc++/79980 fix BOM detection, maxcode checks, UCS2 handling
    
    	PR libstdc++/79980
    	* include/bits/locale_conv.h (__do_str_codecvt): Set __count on
    	error path.
    	* src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads
    	for manipulating codecvt_mode values.
    	(read_utf16_bom): Compare input to BOM constants instead of integral
    	constants that depend on endianness.  Take mode parameter by
    	reference and adjust it, to distinguish between no BOM present and
    	UTF-16BE BOM present.
    	(ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom.
    	(surrogates): New enumeration type.
    	(utf16_in, utf16_out): Add surrogates parameter to choose between
    	UTF-16 and UCS2 behaviour.
    	(utf16_span, ucs2_span): Use std::min not std::max.
    	(ucs2_out): Use std::min not std::max.  Disallow surrogate pairs.
    	(ucs2_in): Likewise. Adjust calls to read_utf16_bom.
    	* testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test.
    	* testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test.

diff --git a/libstdc++-v3/include/bits/locale_conv.h b/libstdc++-v3/include/bits/locale_conv.h
index cd8f146..9b952d4 100644
--- a/libstdc++-v3/include/bits/locale_conv.h
+++ b/libstdc++-v3/include/bits/locale_conv.h
@@ -81,7 +81,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	     && (__outstr.size() - __outchars) < __maxlen);
 
       if (__result == codecvt_base::error)
-	return false;
+	{
+	  __count = __next - __first;
+	  return false;
+	}
 
       if (__result == codecvt_base::noconv)
 	{
diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index 9b63e2b..a50804c 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -24,13 +24,27 @@
 
 #include <codecvt>
 #include <cstring>		// std::memcpy, std::memcmp
-#include <bits/stl_algobase.h>	// std::max
+#include <bits/stl_algobase.h>	// std::min
 
 #ifdef _GLIBCXX_USE_C99_STDINT_TR1
 namespace std _GLIBCXX_VISIBILITY(default)
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
+  // The standard doesn't define these operators, which is annoying.
+  static underlying_type<codecvt_mode>::type
+  to_integer(codecvt_mode m)
+  { return static_cast<underlying_type<codecvt_mode>::type>(m); }
+
+  static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
+  { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
+
+  static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
+  { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
+
+  static codecvt_mode operator~(codecvt_mode m)
+  { return codecvt_mode(~to_integer(m)); }
+
 namespace
 {
   // Largest code point that fits in a single UTF-16 code unit.
@@ -117,22 +131,26 @@ namespace
       read_bom(from, utf8_bom);
   }
 
-  // If consume_header is set in mode update from.next to after any BOM.
-  // Return little_endian iff the UTF-16LE BOM was present.
-  codecvt_mode
-  read_utf16_bom(range<const char16_t>& from, codecvt_mode mode)
+  // If consume_header is not set in mode, no effects.
+  // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
+  // - if the UTF-16BE BOM was found unset little_endian in mode, or
+  // - if the UTF-16LE BOM was found set little_endian in mode.
+  void
+  read_utf16_bom(range<const char16_t>& from, codecvt_mode& mode)
   {
     if (mode & consume_header && from.size())
       {
-	if (*from.next == 0xFEFF)
-	  ++from.next;
-	else if (*from.next == 0xFFFE)
+	if (!memcmp(from.next, utf16_bom, 2))
 	  {
 	    ++from.next;
-	    return little_endian;
+	    mode &= ~little_endian;
+	  }
+	else if (!memcmp(from.next, utf16le_bom, 2))
+	  {
+	    ++from.next;
+	    mode |= little_endian;
 	  }
       }
-    return {};
   }
 
   // Read a codepoint from a UTF-8 multibyte sequence.
@@ -380,8 +398,7 @@ namespace
   ucs4_in(range<const char16_t>& from, range<char32_t>& to,
           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
   {
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
+    read_utf16_bom(from, mode);
     while (from.size() && to.size())
       {
 	const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
@@ -413,11 +430,15 @@ namespace
     return codecvt_base::ok;
   }
 
-  // utf8 -> utf16
+  // Flag indicating whether to process UTF-16 or UCS2
+  enum class surrogates { allowed, disallowed };
+
+  // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
   template<typename C>
   codecvt_base::result
   utf16_in(range<const char>& from, range<C>& to,
-           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
+	   unsigned long maxcode = max_code_point, codecvt_mode mode = {},
+	   surrogates s = surrogates::allowed)
   {
     read_utf8_bom(from, mode);
     while (from.size() && to.size())
@@ -425,7 +446,12 @@ namespace
 	const char* const first = from.next;
 	const char32_t codepoint = read_utf8_code_point(from, maxcode);
 	if (codepoint == incomplete_mb_character)
-	  return codecvt_base::partial;
+	  {
+	    if (s == surrogates::allowed)
+	      return codecvt_base::partial;
+	    else
+	      return codecvt_base::error; // No surrogates in UCS2
+	  }
 	if (codepoint > maxcode)
 	  return codecvt_base::error;
 	if (!write_utf16_code_point(to, codepoint, mode))
@@ -437,11 +463,12 @@ namespace
     return codecvt_base::ok;
   }
 
-  // utf16 -> utf8
+  // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
   template<typename C>
   codecvt_base::result
   utf16_out(range<const C>& from, range<char>& to,
-            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
+	    unsigned long maxcode = max_code_point, codecvt_mode mode = {},
+	    surrogates s = surrogates::allowed)
   {
     if (!write_utf8_bom(to, mode))
       return codecvt_base::partial;
@@ -451,6 +478,9 @@ namespace
 	int inc = 1;
 	if (is_high_surrogate(c))
 	  {
+	    if (s == surrogates::disallowed)
+	      return codecvt_base::error; // No surrogates in UCS-2
+
 	    if (from.size() < 2)
 	      return codecvt_base::ok; // stop converting at this point
 
@@ -492,7 +522,7 @@ namespace
 	++count;
       }
     if (count+1 == max) // take one more character if it fits in a single unit
-      read_utf8_code_point(from, std::max(max_single_utf16_unit, maxcode));
+      read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
     return from.next;
   }
 
@@ -501,7 +531,9 @@ namespace
   ucs2_in(range<const char>& from, range<char16_t>& to,
 	  char32_t maxcode = max_code_point, codecvt_mode mode = {})
   {
-    return utf16_in(from, to, std::max(max_single_utf16_unit, maxcode), mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
+    return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
   }
 
   // ucs2 -> utf8
@@ -509,7 +541,9 @@ namespace
   ucs2_out(range<const char16_t>& from, range<char>& to,
 	   char32_t maxcode = max_code_point, codecvt_mode mode = {})
   {
-    return utf16_out(from, to, std::max(max_single_utf16_unit, maxcode), mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
+    return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
   }
 
   // ucs2 -> utf16
@@ -537,14 +571,14 @@ namespace
   ucs2_in(range<const char16_t>& from, range<char16_t>& to,
 	  char32_t maxcode = max_code_point, codecvt_mode mode = {})
   {
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    read_utf16_bom(from, mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
     while (from.size() && to.size())
       {
 	const char32_t c = read_utf16_code_point(from, maxcode, mode);
 	if (c == incomplete_mb_character)
-	  return codecvt_base::partial;
+	  return codecvt_base::error; // UCS-2 only supports single units.
 	if (c > maxcode)
 	  return codecvt_base::error;
 	*to.next++ = c;
@@ -557,9 +591,9 @@ namespace
             char32_t maxcode, codecvt_mode mode)
   {
     range<const char16_t> from{ begin, end };
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    read_utf16_bom(from, mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
     char32_t c = 0;
     while (max-- && c <= maxcode)
       c = read_utf16_code_point(from, maxcode, mode);
@@ -572,7 +606,8 @@ namespace
   {
     range<const char> from{ begin, end };
     read_utf8_bom(from, mode);
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
     char32_t c = 0;
     while (max-- && c <= maxcode)
       c = read_utf8_code_point(from, maxcode);
@@ -598,8 +633,7 @@ namespace
             char32_t maxcode = max_code_point, codecvt_mode mode = {})
   {
     range<const char16_t> from{ begin, end };
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
+    read_utf16_bom(from, mode);
     char32_t c = 0;
     while (max-- && c <= maxcode)
       c = read_utf16_code_point(from, maxcode, mode);
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc
new file mode 100644
index 0000000..9383818
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc
@@ -0,0 +1,115 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <locale>
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+// PR libstdc++/79980
+
+constexpr std::codecvt_mode mode(std::codecvt_mode m)
+{ return static_cast<std::codecvt_mode>(m | std::consume_header); }
+
+template<typename WCh, unsigned long Max = 0x10FFFF,
+	 std::codecvt_mode Mode = std::consume_header>
+  using Conv
+    = std::wstring_convert<std::codecvt_utf16<WCh, Max, mode(Mode)>, WCh>;
+
+void
+test01()
+{
+  const char src[] = "\xFE\xFF\xAB\xCD";
+  Conv<char16_t> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xabcd );
+}
+
+void
+test02()
+{
+  const char src[] = "\xFF\xFE\xAB\xCD";
+  Conv<char16_t> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xcdab );
+}
+
+void
+test03()
+{
+  const char src[] = "\xFE\xFF\xAB\xCD";
+  Conv<char16_t, 0x10FFFF, std::little_endian> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xabcd );
+}
+
+void
+test04()
+{
+  const char src[] = "\xFF\xFE\xAB\xCD";
+  Conv<char16_t, 0x10FFFF, std::little_endian> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xcdab );
+}
+
+void
+test05()
+{
+  const char src[] = "\0\x61\xAB\xCD"; // character greater than 0x00FF
+  Conv<char16_t, 0xFF> conv("to_bytes failed", u"from_bytes failed");
+  std::u16string result = conv.from_bytes(src, src+4);
+  VERIFY( result == u"from_bytes failed" );
+  VERIFY( conv.converted() == 2 );
+}
+
+void
+test06()
+{
+  const char src[] = "\0\x61\xAB\xCD";
+  Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
+  std::u16string result = conv.from_bytes(src, src+3); // incomplete character
+  VERIFY( result == u"from_bytes failed" );
+  VERIFY( conv.converted() == 2 );
+}
+
+void
+test07()
+{
+  Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
+  // ucs2 to utf-16 conversion should fail on invalid ucs2 input:
+  std::u16string utf16 = u"1234\U00001111\U0001ffff";
+  auto out = conv.to_bytes(utf16);
+  VERIFY( out == "to_bytes failed" );
+  VERIFY( conv.converted() == 5 );
+
+  // And should also fail on incomplete surrogate pair (not return partial):
+  out = conv.to_bytes(utf16.substr(0, utf16.size()-1));
+  VERIFY( out == "to_bytes failed" );
+  VERIFY( conv.converted() == 5 );
+}
+
+int main()
+{
+  test01();
+  test02();
+  test03();
+  test04();
+  test05();
+  test06();
+  test07();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc
new file mode 100644
index 0000000..1251acb
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc
@@ -0,0 +1,94 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <locale>
+#include <string>
+#include <testsuite_hooks.h>
+
+using std::wstring_convert;
+using std::codecvt_utf8;
+
+void
+test01()
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char16_t>, char16_t> c("bad", u"BAD");
+
+  // utf-8 to ucs2 conversion should fail on character outside BMP
+  auto ucs2 = c.from_bytes(src);
+  VERIFY( ucs2 == u"BAD" );
+  VERIFY( c.converted() == 7 );
+
+  // ucs2 to utf-8 conversion should fail on invalid ucs2 input:
+  std::u16string utf16 = u"1234\U00001111\U0001ffff";
+  auto out = c.to_bytes(utf16);
+  VERIFY( out == "bad" );
+  VERIFY( c.converted() == 5 );
+
+  // And should also fail on incomplete surrogate pair (not return partial):
+  out = c.to_bytes(utf16.substr(0, utf16.size()-1));
+  VERIFY( out == "bad" );
+  VERIFY( c.converted() == 5 );
+}
+
+void
+test02()
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char16_t, 0x1000>, char16_t> c("bad", u"BAD");
+
+  // utf-8 to ucs2 conversion should fail on character above Maxcode=0x1000
+  auto ucs2 = c.from_bytes(src);
+  VERIFY( ucs2 == u"BAD" );
+  VERIFY( c.converted() == 4 );
+}
+
+void
+test03()
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char32_t, 0x10000>, char32_t> c("bad", U"BAD");
+
+  // utf-8 to ucs4 conversion should fail on character above Maxcode=0x10000
+  auto ucs4 = c.from_bytes(src);
+  VERIFY( ucs4 == U"BAD" );
+  VERIFY( c.converted() == 7 );
+}
+
+void
+test04()
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char32_t, 0x1000>, char32_t> c("bad", U"BAD");
+
+  // utf-8 to ucs4 conversion should fail on character above Maxcode=0x1000
+  auto ucs4 = c.from_bytes(src);
+  VERIFY( ucs4 == U"BAD" );
+  VERIFY( c.converted() == 4 );
+}
+
+int
+main()
+{
+  test01();
+  test02();
+  test03();
+  test04();
+}
commit 1618fc19cba68e26def23cdf9ad980fa5e672683
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Sat Mar 11 14:15:38 2017 +0000

    Fix encoding() and max_length() values for codecvt facets
    
    	* src/c++11/codecvt.cc (codecvt<char16_t, char, mbstate_t>)
    	(codecvt<char32_t, char, mbstate_t>, __codecvt_utf8_base<char16_t>)
    	(__codecvt_utf8_base<char32_t>, __codecvt_utf8_base<wchar_t>)
    	(__codecvt_utf16_base<char16_t>, __codecvt_utf16_base<char32_t>)
    	(__codecvt_utf16_base<wchar_t>, __codecvt_utf8_utf16_base<char16_t>)
    	(__codecvt_utf8_utf16_base<char32_t>)
    	(__codecvt_utf8_utf16_base<wchar_t>): Fix do_encoding() and
    	do_max_length() return values.
    	* testsuite/22_locale/codecvt/codecvt_utf16/members.cc: New test.
    	* testsuite/22_locale/codecvt/codecvt_utf8/members.cc: New test.
    	* testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc: New test.

diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index a50804c..9c91725 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -72,8 +72,8 @@ namespace
 
   // Multibyte sequences can have "header" consisting of Byte Order Mark
   const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
-  const unsigned char utf16_bom[4] = { 0xFE, 0xFF };
-  const unsigned char utf16le_bom[4] = { 0xFF, 0xFE };
+  const unsigned char utf16_bom[2] = { 0xFE, 0xFF };
+  const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };
 
   template<size_t N>
     inline bool
@@ -695,7 +695,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
@@ -713,9 +713,9 @@ do_length(state_type&, const extern_type* __from,
 int
 codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
 {
-  // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
-  // whereas 4 byte sequences require two 16-bit code units.
-  return 3;
+  // A single character (one or two UTF-16 code units) requires
+  // up to four UTF-8 code units.
+  return 4;
 }
 
 // Define members of codecvt<char32_t, char, mbstate_t> specialization.
@@ -766,7 +766,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
@@ -783,7 +783,11 @@ do_length(state_type&, const extern_type* __from,
 
 int
 codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
-{ return 4; }
+{
+  // A single character (one UTF-32 code unit) requires
+  // up to 4 UTF-8 code units.
+  return 4;
+}
 
 // Define members of codecvt_utf8<char16_t> base class implementation.
 // Converts from UTF-8 to UCS-2.
@@ -835,7 +839,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf8_base<char16_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 __codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
@@ -852,7 +856,14 @@ do_length(state_type&, const extern_type* __from,
 
 int
 __codecvt_utf8_base<char16_t>::do_max_length() const throw()
-{ return 3; }
+{
+  // A single UCS-2 character requires up to three UTF-8 code units.
+  // (UCS-2 cannot represent characters that use four UTF-8 code units).
+  int max = 3;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
+}
 
 // Define members of codecvt_utf8<char32_t> base class implementation.
 // Converts from UTF-8 to UTF-32 (aka UCS-4).
@@ -900,7 +911,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf8_base<char32_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 __codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
@@ -917,7 +928,13 @@ do_length(state_type&, const extern_type* __from,
 
 int
 __codecvt_utf8_base<char32_t>::do_max_length() const throw()
-{ return 4; }
+{
+  // A single UCS-4 character requires up to four UTF-8 code units.
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
+}
 
 #ifdef _GLIBCXX_USE_WCHAR_T
 // Define members of codecvt_utf8<wchar_t> base class implementation.
@@ -992,7 +1009,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf8_base<wchar_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
@@ -1015,7 +1032,16 @@ do_length(state_type&, const extern_type* __from,
 
 int
 __codecvt_utf8_base<wchar_t>::do_max_length() const throw()
-{ return 4; }
+{
+#if __SIZEOF_WCHAR_T__ == 2
+  int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
+#else
+  int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
+#endif
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
+}
 #endif
 
 // Define members of codecvt_utf16<char16_t> base class implementation.
@@ -1070,7 +1096,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf16_base<char16_t>::do_encoding() const throw()
-{ return 1; }
+{ return 0; } // UTF-16 is not a fixed-width encoding
 
 bool
 __codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
@@ -1089,7 +1115,14 @@ do_length(state_type&, const extern_type* __from,
 
 int
 __codecvt_utf16_base<char16_t>::do_max_length() const throw()
-{ return 3; }
+{
+  // A single UCS-2 character requires one UTF-16 code unit (so two chars).
+  // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
+  int max = 2;
+  if (_M_mode & consume_header)
+    max += sizeof(utf16_bom);
+  return max;
+}
 
 // Define members of codecvt_utf16<char32_t> base class implementation.
 // Converts from UTF-16 to UTF-32 (aka UCS-4).
@@ -1143,7 +1176,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf16_base<char32_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-16 is not a fixed-width encoding
 
 bool
 __codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
@@ -1162,7 +1195,14 @@ do_length(state_type&, const extern_type* __from,
 
 int
 __codecvt_utf16_base<char32_t>::do_max_length() const throw()
-{ return 4; }
+{
+  // A single UCS-4 character requires one or two UTF-16 code units
+  // (so up to four chars).
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf16_bom);
+  return max;
+}
 
 #ifdef _GLIBCXX_USE_WCHAR_T
 // Define members of codecvt_utf16<wchar_t> base class implementation.
@@ -1237,7 +1277,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf16_base<wchar_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-16 is not a fixed-width encoding
 
 bool
 __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
@@ -1261,7 +1301,16 @@ do_length(state_type&, const extern_type* __from,
 
 int
 __codecvt_utf16_base<wchar_t>::do_max_length() const throw()
-{ return 4; }
+{
+#if __SIZEOF_WCHAR_T__ == 2
+  int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
+#else
+  int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
+#endif
+  if (_M_mode & consume_header)
+    max += sizeof(utf16_bom);
+  return max;
+}
 #endif
 
 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
@@ -1314,7 +1363,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
@@ -1332,9 +1381,12 @@ do_length(state_type&, const extern_type* __from,
 int
 __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
 {
-  // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
-  // whereas 4 byte sequences require two 16-bit code units.
-  return 3;
+  // A single character can be 1 or 2 UTF-16 code units,
+  // requiring up to 4 UTF-8 code units.
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
 }
 
 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
@@ -1387,7 +1439,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
@@ -1405,9 +1457,12 @@ do_length(state_type&, const extern_type* __from,
 int
 __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
 {
-  // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
-  // whereas 4 byte sequences require two 16-bit code units.
-  return 3;
+  // A single character can be 1 or 2 UTF-16 code units,
+  // requiring up to 4 UTF-8 code units.
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
 }
 
 #ifdef _GLIBCXX_USE_WCHAR_T
@@ -1461,7 +1516,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
@@ -1479,9 +1534,12 @@ do_length(state_type&, const extern_type* __from,
 int
 __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
 {
-  // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
-  // whereas 4 byte sequences require two 16-bit code units.
-  return 3;
+  // A single character can be 1 or 2 UTF-16 code units,
+  // requiring up to 4 UTF-8 code units.
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
 }
 #endif
 
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
index b40fc65..3288e77 100644
--- a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
@@ -34,7 +34,7 @@ test01()
   const codecvt_c16* const cvt = &use_facet<codecvt_c16>(loc_c);
 
   VERIFY(!cvt->always_noconv());
-  VERIFY(cvt->max_length() == 3);
+  VERIFY(cvt->max_length() == 4);
   VERIFY(cvt->encoding() == 0);
 
   const char u8dat[] = u8"H\U000000E4ll\U000000F6 \U0001F63F \U000056FD "
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc
new file mode 100644
index 0000000..993c860
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc
@@ -0,0 +1,81 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+const int bomlen = 2; // UTF-16 BOM is 16 bits
+
+void
+test01()
+{
+  const int maxlen = 2;
+
+  std::codecvt_utf16<char16_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf16<char16_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+}
+
+void
+test02()
+{
+  const int maxlen = 4;
+
+  std::codecvt_utf16<char32_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf16<char32_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+}
+
+void
+test03()
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+  const int maxlen = sizeof(wchar_t) == 4 ? 4 : 2;
+
+  std::codecvt_utf16<wchar_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf16<wchar_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+#endif
+}
+
+int
+main()
+{
+  test01();
+  test02();
+  test03();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc
new file mode 100644
index 0000000..baeb049
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc
@@ -0,0 +1,81 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+const int bomlen = 3; // UTF-8 BOM is 24 bits
+
+void
+test01() // codecvt_utf8<char16_t>: checks always_noconv(), encoding() and max_length()
+{
+  const int maxlen = 3; // expected max_length() when no header is consumed
+
+  std::codecvt_utf8<char16_t> c; // facet with default Maxcode and Mode
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 ); // 0: not a fixed-width encoding (see std::codecvt::encoding)
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8<char16_t, 0x10ffff, std::consume_header> c_bom; // same facet, consume_header mode
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // expected to grow by the BOM size
+}
+
+void
+test02() // codecvt_utf8<char32_t>: checks always_noconv(), encoding() and max_length()
+{
+  const int maxlen = 4; // expected max_length() when no header is consumed
+
+  std::codecvt_utf8<char32_t> c; // facet with default Maxcode and Mode
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 ); // 0: not a fixed-width encoding (see std::codecvt::encoding)
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8<char32_t, 0x10ffff, std::consume_header> c_bom; // same facet, consume_header mode
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // expected to grow by the BOM size
+}
+
+void
+test03() // codecvt_utf8<wchar_t>: same checks; body is empty without wchar_t support
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+  const int maxlen = sizeof(wchar_t) == 4 ? 4 : 3; // expectation depends on the width of wchar_t
+
+  std::codecvt_utf8<wchar_t> c; // facet with default Maxcode and Mode
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 ); // 0: not a fixed-width encoding (see std::codecvt::encoding)
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header> c_bom; // same facet, consume_header mode
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // expected to grow by the BOM size
+#endif
+}
+
+int
+main() // runs the char16_t, char32_t and wchar_t member checks in order
+{
+  test01();
+  test02();
+  test03();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc
new file mode 100644
index 0000000..8fcdfff
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc
@@ -0,0 +1,76 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+const int bomlen = 3; // UTF-8 BOM is 24 bits
+const int maxlen = 4;
+
+void
+test01() // codecvt_utf8_utf16<char16_t>: checks always_noconv(), encoding() and max_length()
+{
+  std::codecvt_utf8_utf16<char16_t> c; // facet with default Maxcode and Mode
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 ); // 0: not a fixed-width encoding (see std::codecvt::encoding)
+  VERIFY( c.max_length() == maxlen ); // file-scope maxlen is shared by all element types here
+
+  std::codecvt_utf8_utf16<char16_t, 0x10ffff, std::consume_header> c_bom; // same facet, consume_header mode
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // expected to grow by the BOM size
+}
+
+void
+test02() // codecvt_utf8_utf16<char32_t>: checks always_noconv(), encoding() and max_length()
+{
+  std::codecvt_utf8_utf16<char32_t> c; // facet with default Maxcode and Mode
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 ); // 0: not a fixed-width encoding (see std::codecvt::encoding)
+  VERIFY( c.max_length() == maxlen ); // file-scope maxlen is shared by all element types here
+
+  std::codecvt_utf8_utf16<char32_t, 0x10ffff, std::consume_header> c_bom; // same facet, consume_header mode
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // expected to grow by the BOM size
+}
+
+void
+test03() // codecvt_utf8_utf16<wchar_t>: same checks; body is empty without wchar_t support
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+  std::codecvt_utf8_utf16<wchar_t> c; // facet with default Maxcode and Mode
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 ); // 0: not a fixed-width encoding (see std::codecvt::encoding)
+  VERIFY( c.max_length() == maxlen ); // same expectation regardless of sizeof(wchar_t)
+
+  std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::consume_header> c_bom; // same facet, consume_header mode
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // expected to grow by the BOM size
+#endif
+}
+
+int
+main() // runs the char16_t, char32_t and wchar_t member checks in order
+{
+  test01();
+  test02();
+  test03();
+}
diff mbox

Patch

diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index 12a4d4f..9b63e2b 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -315,7 +315,7 @@  namespace
   {
     static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
 
-    if (codepoint < max_single_utf16_unit)
+    if (codepoint <= max_single_utf16_unit)
       {
 	if (to.size() > 0)
 	  {
@@ -1341,7 +1341,11 @@  do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 {
   range<const char> from{ __from, __from_end };
   range<char32_t> to{ __to, __to_end };
-  auto res = utf16_in(from, to, _M_maxcode, _M_mode);
+  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
+#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
+  mode = codecvt_mode(mode | little_endian);
+#endif
+  auto res = utf16_in(from, to, _M_maxcode, mode);
   __from_next = from.next;
   __to_next = to.next;
   return res;
@@ -1411,7 +1415,11 @@  do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 {
   range<const char> from{ __from, __from_end };
   range<wchar_t> to{ __to, __to_end };
-  auto res = utf16_in(from, to, _M_maxcode, _M_mode);
+  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
+#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
+  mode = codecvt_mode(mode | little_endian);
+#endif
+  auto res = utf16_in(from, to, _M_maxcode, mode);
   __from_next = from.next;
   __to_next = to.next;
   return res;
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc
new file mode 100644
index 0000000..5555bcb
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc
@@ -0,0 +1,60 @@ 
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <locale>
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+// PR libstdc++/79511
+
+template<typename ElemT> // ElemT: element type of the returned string
+  std::basic_string<ElemT> conv(const char* src) // converts the bytes at src via codecvt_utf8_utf16<ElemT>
+  {
+    std::wstring_convert<std::codecvt_utf8_utf16<ElemT>, ElemT> conv; // local converter shadows the function name
+    return conv.from_bytes(src); // src is treated as a null-terminated byte string
+  }
+
+void
+test01() // PR 79511: U+FFFF must come out as the single code unit 0xffff
+{
+  static char const src[] = "\xEF\xBF\xBF"; // UTF-8 encoding of U+FFFF
+  VERIFY( conv<char16_t>(src) == u"\xffff" ); // one unit, not a surrogate pair
+  VERIFY( conv<char32_t>(src) == U"\xffff" );
+#ifdef _GLIBCXX_USE_WCHAR_T
+  VERIFY( conv<wchar_t>(src) == L"\xffff" );
+#endif
+}
+
+void
+test02() // PR 79511: U+20AC must come out as the single code unit 0x20ac
+{
+  static char const src[] = "\xE2\x82\xAC"; // UTF-8 encoding of U+20AC
+  VERIFY( conv<char16_t>(src) == u"\x20ac" ); // 0x20ac != byte-swapped 0xac20, unlike 0xffff in test01
+  VERIFY( conv<char32_t>(src) == U"\x20ac" );
+#ifdef _GLIBCXX_USE_WCHAR_T
+  VERIFY( conv<wchar_t>(src) == L"\x20ac" );
+#endif
+}
+
+int
+main() // runs both PR 79511 conversion checks
+{
+  test01();
+  test02();
+}