diff mbox

libstdc++/64797 fix handling of incomplete multibyte characters

Message ID 20150304172001.GI8789@redhat.com
State New
Headers show

Commit Message

Jonathan Wakely March 4, 2015, 5:20 p.m. UTC
To fix the non-portable 22_locale/conversions/string/2.cc test, I
changed it to use char16_t and char32_t, for which I can reliably
create an invalid sequence that causes a conversion error. That
revealed some more problems in the Unicode conversion utilities, which
are fixed by this patch and verified by the new tests.

Most of the changes in codecvt.cc just define convenience constants
and inline functions, but some minor bugs in the UTF-16 error handling
are fixed too.

Tested x86_64-linux, committed to trunk.
diff mbox

Patch

commit ad00aa737cbaf61a4243a3ed46a5ed888811a1c5
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Thu Feb 19 11:44:58 2015 +0000

    	PR libstdc++/64797
    	* include/bits/locale_conv.h (wstring_convert::_M_conv): Handle
    	incomplete multibyte sequences correctly.
    	* include/std/codecvt (codecvt_utf8, codecvt_utf16,
    	codecvt_utf8_utf16): Limit _Maxcode to maximum Unicode code point.
    	* src/c++11/codecvt.cc (invalid_mb_sequence, incomplete_mb_character):
    	Define constants.
    	(is_high_surrogate, is_low_surrogate, surrogate_pair_to_code_point):
    	Define convenience functions.
    	(read_utf8_code_point): Return relevant constant to distinguish
    	incomplete characters from invalid sequences.
    	(read_utf16_code_point): Likewise. Check for invalid sequences.
    	(ucs4_in, utf16_in): Use incomplete_mb_character constant.
    	(utf16_out): Check for invalid sequences.
    	(utf16_span): Fix condition.
    	(ucs2_out): Use is_high_surrogate.
    	(ucs2_in): Use incomplete_mb_character constant and fix condition.
    	* testsuite/22_locale/codecvt/char16_t.cc: Fix whitespace.
    	* testsuite/22_locale/conversions/buffer/1.cc: New.
    	* testsuite/22_locale/conversions/string/2.cc: Use char16_t and
    	char32_t instead of wchar_t.
    	* testsuite/22_locale/conversions/string/3.cc: New.

diff --git a/libstdc++-v3/include/bits/locale_conv.h b/libstdc++-v3/include/bits/locale_conv.h
index c8a44f4..b53754d 100644
--- a/libstdc++-v3/include/bits/locale_conv.h
+++ b/libstdc++-v3/include/bits/locale_conv.h
@@ -198,18 +198,20 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  auto __outstr = __err ? _OutStr(__err->get_allocator()) : _OutStr();
 	  size_t __outchars = 0;
 	  auto __next = __first;
+	  const auto __maxlen = _M_cvt->max_length();
 
 	  codecvt_base::result __result;
 	  do
 	    {
-	      __outstr.resize(__outstr.size() + (__last - __next));
+	      __outstr.resize(__outstr.size() + (__last - __next) + __maxlen);
 	      auto __outnext = &__outstr.front() + __outchars;
 	      auto const __outlast = &__outstr.back() + 1;
 	      __result = ((*_M_cvt).*__memfn)(_M_state, __next, __last, __next,
 					    __outnext, __outlast, __outnext);
 	      __outchars = __outnext - &__outstr.front();
 	    }
-	  while (__result == codecvt_base::partial && __next != __last);
+	  while (__result == codecvt_base::partial && __next != __last
+		 && (__outstr.size() - __outchars) < __maxlen);
 
 	  __outstr.resize(__outchars);
 	  _M_count = __next - __first;
@@ -428,7 +430,7 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	      return _M_put(__next, __pending);
 
 	    if (!_M_put(__outbuf, __outnext - __outbuf))
-		return false;
+	      return false;
 	  }
 	while (__next != __last && __next != __start);
 
diff --git a/libstdc++-v3/include/std/codecvt b/libstdc++-v3/include/std/codecvt
index d58a0ec..e4a7d5b 100644
--- a/libstdc++-v3/include/std/codecvt
+++ b/libstdc++-v3/include/std/codecvt
@@ -148,7 +148,9 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
     public: \
       explicit \
       _NAME(size_t __refs = 0) \
-      : __ ## _NAME ## _base<_ELEM>(_Maxcode, _Mode, __refs) { } \
+      : __ ## _NAME ## _base<_ELEM>(std::min(_Maxcode, 0x10fffful), \
+				    _Mode, __refs) \
+      { } \
     }
 
   template<typename _Elem> class __codecvt_utf8_base;
diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index aebd3f3..83ee6e0 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -35,8 +35,14 @@  namespace
 {
   // Largest code point that fits in a single UTF-16 code unit.
   const char32_t max_single_utf16_unit = 0xFFFF;
+
   const char32_t max_code_point = 0x10FFFF;
 
+  // The functions below rely on maxcode < incomplete_mb_character
+  // (which is enforced by the codecvt_utf* classes on construction).
+  const char32_t incomplete_mb_character = char32_t(-2);
+  const char32_t invalid_mb_sequence = char32_t(-1);
+
   template<typename Elem>
     struct range
     {
@@ -131,13 +137,13 @@  namespace
 
   // Read a codepoint from a UTF-8 multibyte sequence.
   // Updates from.next if the codepoint is not greater than maxcode.
-  // Returns -1 if there is an invalid or incomplete multibyte character.
+  // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
   char32_t
   read_utf8_code_point(range<const char>& from, unsigned long maxcode)
   {
-    size_t avail = from.size();
+    const size_t avail = from.size();
     if (avail == 0)
-      return -1;
+      return incomplete_mb_character;
     unsigned char c1 = from.next[0];
     // https://en.wikipedia.org/wiki/UTF-8#Sample_code
     if (c1 < 0x80)
@@ -146,14 +152,14 @@  namespace
       return c1;
     }
     else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
-      return -1;
+      return invalid_mb_sequence;
     else if (c1 < 0xE0) // 2-byte sequence
     {
       if (avail < 2)
-	return -1;
+	return incomplete_mb_character;
       unsigned char c2 = from.next[1];
       if ((c2 & 0xC0) != 0x80)
-	return -1;
+	return invalid_mb_sequence;
       char32_t c = (c1 << 6) + c2 - 0x3080;
       if (c <= maxcode)
 	from.next += 2;
@@ -162,15 +168,15 @@  namespace
     else if (c1 < 0xF0) // 3-byte sequence
     {
       if (avail < 3)
-	return -1;
+	return incomplete_mb_character;
       unsigned char c2 = from.next[1];
       if ((c2 & 0xC0) != 0x80)
-	return -1;
+	return invalid_mb_sequence;
       if (c1 == 0xE0 && c2 < 0xA0) // overlong
-	return -1;
+	return invalid_mb_sequence;
       unsigned char c3 = from.next[2];
       if ((c3 & 0xC0) != 0x80)
-	return -1;
+	return invalid_mb_sequence;
       char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
       if (c <= maxcode)
 	from.next += 3;
@@ -179,27 +185,27 @@  namespace
     else if (c1 < 0xF5) // 4-byte sequence
     {
       if (avail < 4)
-	return -1;
+	return incomplete_mb_character;
       unsigned char c2 = from.next[1];
       if ((c2 & 0xC0) != 0x80)
-	return -1;
+	return invalid_mb_sequence;
       if (c1 == 0xF0 && c2 < 0x90) // overlong
-	return -1;
+	return invalid_mb_sequence;
       if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
-      return -1;
+      return invalid_mb_sequence;
       unsigned char c3 = from.next[2];
       if ((c3 & 0xC0) != 0x80)
-	return -1;
+	return invalid_mb_sequence;
       unsigned char c4 = from.next[3];
       if ((c4 & 0xC0) != 0x80)
-	return -1;
+	return invalid_mb_sequence;
       char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
       if (c <= maxcode)
 	from.next += 4;
       return c;
     }
     else // > U+10FFFF
-      return -1;
+      return invalid_mb_sequence;
   }
 
   bool
@@ -250,27 +256,54 @@  namespace
 #endif
   }
 
+  // Return true if c is a high-surrogate (aka leading) code point.
+  inline bool
+  is_high_surrogate(char32_t c)
+  {
+    return c >= 0xD800 && c <= 0xDBFF;
+  }
+
+  // Return true if c is a low-surrogate (aka trailing) code point.
+  inline bool
+  is_low_surrogate(char32_t c)
+  {
+    return c >= 0xDC00 && c <= 0xDFFF;
+  }
+
+  inline char32_t
+  surrogate_pair_to_code_point(char32_t high, char32_t low)
+  {
+    return (high << 10) + low - 0x35FDC00;
+  }
+
   // Read a codepoint from a UTF-16 multibyte sequence.
   // The sequence's endianness is indicated by (mode & little_endian).
   // Updates from.next if the codepoint is not greater than maxcode.
-  // Returns -1 if there is an incomplete multibyte character.
+  // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
   char32_t
   read_utf16_code_point(range<const char16_t>& from, unsigned long maxcode,
 			codecvt_mode mode)
   {
+    const size_t avail = from.size();
+    if (avail == 0)
+      return incomplete_mb_character;
     int inc = 1;
     char32_t c = adjust_byte_order(from.next[0], mode);
-    if (c >= 0xD800 && c <= 0xDBFF)
+    if (is_high_surrogate(c))
       {
-	if (from.size() < 2)
-	  return -1;
+	if (avail < 2)
+	  return incomplete_mb_character;
 	const char16_t c2 = adjust_byte_order(from.next[1], mode);
-	if (c2 >= 0xDC00 && c2 <= 0xDFFF)
+	if (is_low_surrogate(c2))
 	  {
-	    c = (c << 10) + c2 - 0x35FDC00;
+	    c = surrogate_pair_to_code_point(c, c2);
 	    inc = 2;
 	  }
+	else
+	  return invalid_mb_sequence;
       }
+    else if (is_low_surrogate(c))
+      return invalid_mb_sequence;
     if (c <= maxcode)
       from.next += inc;
     return c;
@@ -314,8 +347,8 @@  namespace
     while (from.size() && to.size())
       {
 	const char32_t codepoint = read_utf8_code_point(from, maxcode);
-	if (codepoint == char32_t(-1))
-	  break;
+	if (codepoint == incomplete_mb_character)
+	  return codecvt_base::partial;
 	if (codepoint > maxcode)
 	  return codecvt_base::error;
 	*to.next++ = codepoint;
@@ -352,8 +385,8 @@  namespace
     while (from.size() && to.size())
       {
 	const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
-	if (codepoint == char32_t(-1))
-	  break;
+	if (codepoint == incomplete_mb_character)
+	  return codecvt_base::partial;
 	if (codepoint > maxcode)
 	  return codecvt_base::error;
 	*to.next++ = codepoint;
@@ -389,11 +422,9 @@  namespace
     read_utf8_bom(from, mode);
     while (from.size() && to.size())
       {
-	const char* first = from.next;
-	if ((unsigned char)*first >= 0xF0 && to.size() < 2)
-	  return codecvt_base::partial;
+	const char* const first = from.next;
 	const char32_t codepoint = read_utf8_code_point(from, maxcode);
-	if (codepoint == char32_t(-1))
+	if (codepoint == incomplete_mb_character)
 	  return codecvt_base::partial;
 	if (codepoint > maxcode)
 	  return codecvt_base::error;
@@ -418,20 +449,22 @@  namespace
       {
 	char32_t c = from.next[0];
 	int inc = 1;
-	if (c >= 0xD800 && c <= 0xDBFF) // start of surrogate pair
+	if (is_high_surrogate(c))
 	  {
 	    if (from.size() < 2)
 	      return codecvt_base::ok; // stop converting at this point
 
 	    const char32_t c2 = from.next[1];
-	    if (c2 >= 0xDC00 && c2 <= 0xDFFF)
+	    if (is_low_surrogate(c2))
 	      {
+		c = surrogate_pair_to_code_point(c, c2);
 		inc = 2;
-		c = (c << 10) + c2 - 0x35FDC00;
 	      }
 	    else
 	      return codecvt_base::error;
 	  }
+	else if (is_low_surrogate(c))
+	  return codecvt_base::error;
 	if (c > maxcode)
 	  return codecvt_base::error;
 	if (!write_utf8_code_point(to, c))
@@ -452,8 +485,8 @@  namespace
     while (count+1 < max)
       {
 	char32_t c = read_utf8_code_point(from, maxcode);
-	if (c == char32_t(-1))
-	  break;
+	if (c > maxcode)
+	  return from.next;
 	else if (c > max_single_utf16_unit)
 	  ++count;
 	++count;
@@ -489,7 +522,7 @@  namespace
     while (from.size() && to.size())
       {
 	char16_t c = from.next[0];
-	if (c >= 0xD800 && c <= 0xDBFF) // start of surrogate pair
+	if (is_high_surrogate(c))
 	  return codecvt_base::error;
 	if (c > maxcode)
 	  return codecvt_base::error;
@@ -510,9 +543,9 @@  namespace
     while (from.size() && to.size())
       {
 	const char32_t c = read_utf16_code_point(from, maxcode, mode);
-	if (c == char32_t(-1))
-	  break;
-	if (c >= maxcode)
+	if (c == incomplete_mb_character)
+	  return codecvt_base::partial;
+	if (c > maxcode)
 	  return codecvt_base::error;
 	*to.next++ = c;
       }
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
index 9271eca..a21a838 100644
--- a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
@@ -79,8 +79,7 @@  test01()
 
     codecvt_c16::state_type state01;
     state01 = {};
-    codecvt_base::result res = cvt->out(state01, u16dat, u16dat_end,
-from_next,
+    codecvt_base::result res = cvt->out(state01, u16dat, u16dat_end, from_next,
                                         buffer, buffer_end, to_next);
 
     VERIFY(res == codecvt_base::ok);
diff --git a/libstdc++-v3/testsuite/22_locale/conversions/buffer/1.cc b/libstdc++-v3/testsuite/22_locale/conversions/buffer/1.cc
new file mode 100644
index 0000000..f008f5a
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/conversions/buffer/1.cc
@@ -0,0 +1,78 @@ 
+// { dg-options "-std=gnu++11" }
+
+// Copyright (C) 2012 Free Software Foundation
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// 22.3.3.2.3  Buffer conversions
+
+#include <locale>
+#include <sstream>
+#include <testsuite_hooks.h>
+
+template<typename Elem>
+struct cvt : std::codecvt<Elem, char, std::mbstate_t> { };
+
+template<typename Elem>
+using buf_conv = std::wbuffer_convert<cvt<Elem>, Elem>;
+
+using std::string;
+using std::stringstream;
+using std::wstring;
+using std::wstringstream;
+
+void test01()
+{
+  buf_conv<wchar_t> buf;
+  std::stringbuf sbuf;
+  VERIFY( buf.rdbuf() == nullptr );
+  VERIFY( buf.rdbuf(&sbuf) == nullptr );
+  VERIFY( buf.rdbuf() == &sbuf );
+  VERIFY( buf.rdbuf(nullptr) == &sbuf );
+}
+
+void test02()
+{
+  std::stringbuf sbuf;
+  buf_conv<char> buf(&sbuf);  // noconv
+
+  stringstream ss;
+  ss.std::ios::rdbuf(&buf);
+  string input = "King for a day...";
+  ss << input << std::flush;
+  string output = sbuf.str();
+  VERIFY( input == output );
+}
+
+void test03()
+{
+  std::stringbuf sbuf;
+  buf_conv<wchar_t> buf(&sbuf);
+
+  wstringstream ss;
+  ss.std::wios::rdbuf(&buf);
+  wstring input = L"Fool for a lifetime";
+  ss << input << std::flush;
+  string output = sbuf.str();
+  VERIFY( output == "Fool for a lifetime" );
+}
+
+int main()
+{
+  test01();
+  test02();
+  test03();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/conversions/string/2.cc b/libstdc++-v3/testsuite/22_locale/conversions/string/2.cc
index 94eb75f..07d2b52 100644
--- a/libstdc++-v3/testsuite/22_locale/conversions/string/2.cc
+++ b/libstdc++-v3/testsuite/22_locale/conversions/string/2.cc
@@ -30,26 +30,43 @@  template<typename Elem>
 using str_conv = std::wstring_convert<cvt<Elem>, Elem>;
 
 using std::string;
-using std::wstring;
+using std::u16string;
+using std::u32string;
 
 // test conversion errors, with and without error strings
 
 void test01()
 {
-  typedef str_conv<wchar_t> sc;
+  typedef str_conv<char16_t> sc;
 
   const sc::byte_string berr = "invalid wide string";
-  const sc::wide_string werr = L"invalid byte string";
+  const sc::wide_string werr = u"invalid byte string";
 
   sc c(berr, werr);
   string input = "Stop";
+  input += char(0xFF);
+  u16string woutput = c.from_bytes(input);
+  VERIFY( werr == woutput );
+  u16string winput = u"Stop";
+  winput += char16_t(0xDC00);
+  string output = c.to_bytes(winput);
+  VERIFY( berr == output );
+}
+
+void test02()
+{
+  typedef str_conv<char32_t> sc;
+
+  const sc::byte_string berr = "invalid wide string";
+  const sc::wide_string werr = U"invalid byte string";
+
+  sc c(berr, werr);
+  string input = "Halt";
   input += char(0xff);
-  input += char(0xff);
-  wstring woutput = c.from_bytes(input);
+  u32string woutput = c.from_bytes(input);
   VERIFY( werr == woutput );
-  wstring winput = L"Stop";
-  winput += wchar_t(0xff);
-  winput += wchar_t(0xff);
+  u32string winput = U"Halt";
+  winput += char32_t(-1);
   string output = c.to_bytes(winput);
   VERIFY( berr == output );
 }
@@ -57,4 +74,5 @@  void test01()
 int main()
 {
   test01();
+  test02();
 }
diff --git a/libstdc++-v3/testsuite/22_locale/conversions/string/3.cc b/libstdc++-v3/testsuite/22_locale/conversions/string/3.cc
new file mode 100644
index 0000000..7c4ac20
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/conversions/string/3.cc
@@ -0,0 +1,61 @@ 
+// { dg-options "-std=gnu++11" }
+
+// Copyright (C) 2012 Free Software Foundation
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// 22.3.3.2.2  String conversions
+
+#include <locale>
+#include <string>
+#include <testsuite_hooks.h>
+
+template<typename Elem>
+struct cvt : std::codecvt<Elem, char, std::mbstate_t> { };
+
+template<typename Elem>
+using str_conv = std::wstring_convert<cvt<Elem>, Elem>;
+
+using std::string;
+using std::u32string;
+
+// test construction with state, for partial conversions
+
+void test01()
+{
+  typedef str_conv<char32_t> wsc;
+
+  wsc c;
+  string input = u8"\u00a3 shillings pence";
+  u32string woutput = c.from_bytes(input.substr(0, 1));
+  auto partial_state = c.state();
+  auto partial_count = c.converted();
+
+  auto woutput2 = c.from_bytes("state reset on next conversion");
+  VERIFY( woutput2 == U"state reset on next conversion" );
+
+  wsc c2(new cvt<char32_t>, partial_state);
+  woutput += c2.from_bytes(input.substr(partial_count));
+  VERIFY( U"\u00a3 shillings pence" == woutput );
+
+  string roundtrip = c2.to_bytes(woutput);
+  VERIFY( input == roundtrip );
+}
+
+int main()
+{
+  test01();
+}