From b6584a72ac8d305731e1771a05c117dc11a3d553 Mon Sep 17 00:00:00 2001
From: Jonathan Wakely <jwakely@redhat.com>
Date: Wed, 4 Mar 2015 17:19:55 +0000
Subject: [PATCH] re PR libstdc++/64797 (22_locale/conversions/string/2.cc
 FAILs)

	PR libstdc++/64797
	* include/bits/locale_conv.h (wstring_convert::_M_conv): Handle
	incomplete multibyte sequences correctly.
	* include/std/codecvt (codecvt_utf8, codecvt_utf16,
	codecvt_utf8_utf16): Limit _Maxcode to maximum Unicode code point.
	* src/c++11/codecvt.cc (invalid_mb_sequence, incomplete_mb_character):
	Define constants.
	(is_high_surrogate, is_low_surrogate, surrogate_pair_to_code_point):
	Define convenience functions.
	(read_utf8_code_point): Return relevant constant to distinguish
	incomplete characters from invalid sequences.
	(read_utf16_code_point): Likewise. Check for invalid sequences.
	(ucs4_in, utf16_in): Use incomplete_mb_character constant.
	(utf16_out): Check for invalid sequences.
	(utf16_span): Fix condition.
	(ucs2_out): Use is_high_surrogate.
	(ucs2_in): Use incomplete_mb_character constant and fix condition.
	* testsuite/22_locale/codecvt/char16_t.cc: Fix whitespace.
	* testsuite/22_locale/conversions/buffer/1.cc: New.
	* testsuite/22_locale/conversions/string/2.cc: Use char16_t and
	char32_t instead of wchar_t.
	* testsuite/22_locale/conversions/string/3.cc: New.

From-SVN: r221189
---
 libstdc++-v3/ChangeLog                        |  25 ++++
 libstdc++-v3/include/bits/locale_conv.h       |   8 +-
 libstdc++-v3/include/std/codecvt              |   4 +-
 libstdc++-v3/src/c++11/codecvt.cc             | 113 +++++++++++-------
 .../testsuite/22_locale/codecvt/char16_t.cc   |   3 +-
 .../22_locale/conversions/buffer/1.cc         |  78 ++++++++++++
 .../22_locale/conversions/string/2.cc         |  34 ++++--
 .../22_locale/conversions/string/3.cc         |  61 ++++++++++
 8 files changed, 272 insertions(+), 54 deletions(-)
 create mode 100644 libstdc++-v3/testsuite/22_locale/conversions/buffer/1.cc
 create mode 100644 libstdc++-v3/testsuite/22_locale/conversions/string/3.cc

diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog
index 265fc7a4835..16760dade1e 100644
--- a/libstdc++-v3/ChangeLog
+++ b/libstdc++-v3/ChangeLog
@@ -1,3 +1,28 @@
+2015-03-04  Jonathan Wakely  <jwakely@redhat.com>
+
+	PR libstdc++/64797
+	* include/bits/locale_conv.h (wstring_convert::_M_conv): Handle
+	incomplete multibyte sequences correctly.
+	* include/std/codecvt (codecvt_utf8, codecvt_utf16,
+	codecvt_utf8_utf16): Limit _Maxcode to maximum Unicode code point.
+	* src/c++11/codecvt.cc (invalid_mb_sequence, incomplete_mb_character):
+	Define constants.
+	(is_high_surrogate, is_low_surrogate, surrogate_pair_to_code_point):
+	Define convenience functions.
+	(read_utf8_code_point): Return relevant constant to distinguish
+	incomplete characters from invalid sequences.
+	(read_utf16_code_point): Likewise. Check for invalid sequences.
+	(ucs4_in, utf16_in): Use incomplete_mb_character constant.
+	(utf16_out): Check for invalid sequences.
+	(utf16_span): Fix condition.
+	(ucs2_out): Use is_high_surrogate.
+	(ucs2_in): Use incomplete_mb_character constant and fix condition.
+	* testsuite/22_locale/codecvt/char16_t.cc: Fix whitespace.
+	* testsuite/22_locale/conversions/buffer/1.cc: New.
+	* testsuite/22_locale/conversions/string/2.cc: Use char16_t and
+	char32_t instead of wchar_t.
+	* testsuite/22_locale/conversions/string/3.cc: New.
+
 2015-03-03  Iain Sandoe  <iain@codesourcery.com>
 
 	PR libstdc++/64883
diff --git a/libstdc++-v3/include/bits/locale_conv.h b/libstdc++-v3/include/bits/locale_conv.h
index c8a44f42421..b53754d1541 100644
--- a/libstdc++-v3/include/bits/locale_conv.h
+++ b/libstdc++-v3/include/bits/locale_conv.h
@@ -198,18 +198,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  auto __outstr = __err ? _OutStr(__err->get_allocator()) : _OutStr();
 	  size_t __outchars = 0;
 	  auto __next = __first;
+	  const auto __maxlen = _M_cvt->max_length();
 
 	  codecvt_base::result __result;
 	  do
 	    {
-	      __outstr.resize(__outstr.size() + (__last - __next));
+	      __outstr.resize(__outstr.size() + (__last - __next) + __maxlen);
 	      auto __outnext = &__outstr.front() + __outchars;
 	      auto const __outlast = &__outstr.back() + 1;
 	      __result = ((*_M_cvt).*__memfn)(_M_state, __next, __last, __next,
 					    __outnext, __outlast, __outnext);
 	      __outchars = __outnext - &__outstr.front();
 	    }
-	  while (__result == codecvt_base::partial && __next != __last);
+	  while (__result == codecvt_base::partial && __next != __last
+		 && (__outstr.size() - __outchars) < __maxlen);
 
 	  __outstr.resize(__outchars);
 	  _M_count = __next - __first;
@@ -428,7 +430,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	      return _M_put(__next, __pending);
 
 	    if (!_M_put(__outbuf, __outnext - __outbuf))
-		return false;
+	      return false;
 	  }
 	while (__next != __last && __next != __start);
 
diff --git a/libstdc++-v3/include/std/codecvt b/libstdc++-v3/include/std/codecvt
index d58a0ecd673..e4a7d5bbb60 100644
--- a/libstdc++-v3/include/std/codecvt
+++ b/libstdc++-v3/include/std/codecvt
@@ -148,7 +148,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     public: \
       explicit \
       _NAME(size_t __refs = 0) \
-      : __ ## _NAME ## _base<_ELEM>(_Maxcode, _Mode, __refs) { } \
+      : __ ## _NAME ## _base<_ELEM>(std::min(_Maxcode, 0x10fffful), \
+				    _Mode, __refs) \
+      { } \
     }
 
   template<typename _Elem> class __codecvt_utf8_base;
diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index aebd3f34986..83ee6e06831 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -35,8 +35,14 @@ namespace
 {
   // Largest code point that fits in a single UTF-16 code unit.
   const char32_t max_single_utf16_unit = 0xFFFF;
+
   const char32_t max_code_point = 0x10FFFF;
 
+  // The functions below rely on maxcode < incomplete_mb_character
+  // (which is enforced by the codecvt_utf* classes on construction).
+  const char32_t incomplete_mb_character = char32_t(-2);
+  const char32_t invalid_mb_sequence = char32_t(-1);
+
   template<typename Elem>
     struct range
     {
@@ -131,13 +137,13 @@ namespace
 
   // Read a codepoint from a UTF-8 multibyte sequence.
   // Updates from.next if the codepoint is not greater than maxcode.
-  // Returns -1 if there is an invalid or incomplete multibyte character.
+  // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
   char32_t
   read_utf8_code_point(range<const char>& from, unsigned long maxcode)
   {
-    size_t avail = from.size();
+    const size_t avail = from.size();
     if (avail == 0)
-      return -1;
+      return incomplete_mb_character;
     unsigned char c1 = from.next[0];
     // https://en.wikipedia.org/wiki/UTF-8#Sample_code
     if (c1 < 0x80)
@@ -146,14 +152,14 @@ namespace
       return c1;
     }
     else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
-      return -1;
+      return invalid_mb_sequence;
     else if (c1 < 0xE0) // 2-byte sequence
     {
       if (avail < 2)
-	return -1;
+	return incomplete_mb_character;
       unsigned char c2 = from.next[1];
       if ((c2 & 0xC0) != 0x80)
-	return -1;
+	return invalid_mb_sequence;
       char32_t c = (c1 << 6) + c2 - 0x3080;
       if (c <= maxcode)
 	from.next += 2;
@@ -162,15 +168,15 @@ namespace
     else if (c1 < 0xF0) // 3-byte sequence
     {
       if (avail < 3)
-	return -1;
+	return incomplete_mb_character;
       unsigned char c2 = from.next[1];
       if ((c2 & 0xC0) != 0x80)
-	return -1;
+	return invalid_mb_sequence;
       if (c1 == 0xE0 && c2 < 0xA0) // overlong
-	return -1;
+	return invalid_mb_sequence;
       unsigned char c3 = from.next[2];
       if ((c3 & 0xC0) != 0x80)
-	return -1;
+	return invalid_mb_sequence;
       char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
       if (c <= maxcode)
 	from.next += 3;
@@ -179,27 +185,27 @@ namespace
     else if (c1 < 0xF5) // 4-byte sequence
     {
       if (avail < 4)
-	return -1;
+	return incomplete_mb_character;
       unsigned char c2 = from.next[1];
       if ((c2 & 0xC0) != 0x80)
-	return -1;
+	return invalid_mb_sequence;
       if (c1 == 0xF0 && c2 < 0x90) // overlong
-	return -1;
+	return invalid_mb_sequence;
       if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
-      return -1;
+      return invalid_mb_sequence;
       unsigned char c3 = from.next[2];
       if ((c3 & 0xC0) != 0x80)
-	return -1;
+	return invalid_mb_sequence;
       unsigned char c4 = from.next[3];
       if ((c4 & 0xC0) != 0x80)
-	return -1;
+	return invalid_mb_sequence;
       char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
       if (c <= maxcode)
 	from.next += 4;
       return c;
     }
     else // > U+10FFFF
-      return -1;
+      return invalid_mb_sequence;
   }
 
   bool
@@ -250,27 +256,54 @@ namespace
 #endif
   }
 
+  // Return true if c is a high-surrogate (aka leading) code point.
+  inline bool
+  is_high_surrogate(char32_t c)
+  {
+    return c >= 0xD800 && c <= 0xDBFF;
+  }
+
+  // Return true if c is a low-surrogate (aka trailing) code point.
+  inline bool
+  is_low_surrogate(char32_t c)
+  {
+    return c >= 0xDC00 && c <= 0xDFFF;
+  }
+
+  inline char32_t
+  surrogate_pair_to_code_point(char32_t high, char32_t low)
+  {
+    return (high << 10) + low - 0x35FDC00;
+  }
+
   // Read a codepoint from a UTF-16 multibyte sequence.
   // The sequence's endianness is indicated by (mode & little_endian).
   // Updates from.next if the codepoint is not greater than maxcode.
-  // Returns -1 if there is an incomplete multibyte character.
+  // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
   char32_t
   read_utf16_code_point(range<const char16_t>& from, unsigned long maxcode,
 			codecvt_mode mode)
   {
+    const size_t avail = from.size();
+    if (avail == 0)
+      return incomplete_mb_character;
     int inc = 1;
     char32_t c = adjust_byte_order(from.next[0], mode);
-    if (c >= 0xD800 && c <= 0xDBFF)
+    if (is_high_surrogate(c))
       {
-	if (from.size() < 2)
-	  return -1;
+	if (avail < 2)
+	  return incomplete_mb_character;
 	const char16_t c2 = adjust_byte_order(from.next[1], mode);
-	if (c2 >= 0xDC00 && c2 <= 0xDFFF)
+	if (is_low_surrogate(c2))
 	  {
-	    c = (c << 10) + c2 - 0x35FDC00;
+	    c = surrogate_pair_to_code_point(c, c2);
 	    inc = 2;
 	  }
+	else
+	  return invalid_mb_sequence;
       }
+    else if (is_low_surrogate(c))
+      return invalid_mb_sequence;
     if (c <= maxcode)
       from.next += inc;
     return c;
@@ -314,8 +347,8 @@ namespace
     while (from.size() && to.size())
       {
 	const char32_t codepoint = read_utf8_code_point(from, maxcode);
-	if (codepoint == char32_t(-1))
-	  break;
+	if (codepoint == incomplete_mb_character)
+	  return codecvt_base::partial;
 	if (codepoint > maxcode)
 	  return codecvt_base::error;
 	*to.next++ = codepoint;
@@ -352,8 +385,8 @@ namespace
     while (from.size() && to.size())
       {
 	const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
-	if (codepoint == char32_t(-1))
-	  break;
+	if (codepoint == incomplete_mb_character)
+	  return codecvt_base::partial;
 	if (codepoint > maxcode)
 	  return codecvt_base::error;
 	*to.next++ = codepoint;
@@ -389,11 +422,9 @@ namespace
     read_utf8_bom(from, mode);
     while (from.size() && to.size())
       {
-	const char* first = from.next;
-	if ((unsigned char)*first >= 0xF0 && to.size() < 2)
-	  return codecvt_base::partial;
+	const char* const first = from.next;
 	const char32_t codepoint = read_utf8_code_point(from, maxcode);
-	if (codepoint == char32_t(-1))
+	if (codepoint == incomplete_mb_character)
 	  return codecvt_base::partial;
 	if (codepoint > maxcode)
 	  return codecvt_base::error;
@@ -418,20 +449,22 @@ namespace
       {
 	char32_t c = from.next[0];
 	int inc = 1;
-	if (c >= 0xD800 && c <= 0xDBFF) // start of surrogate pair
+	if (is_high_surrogate(c))
 	  {
 	    if (from.size() < 2)
 	      return codecvt_base::ok; // stop converting at this point
 
 	    const char32_t c2 = from.next[1];
-	    if (c2 >= 0xDC00 && c2 <= 0xDFFF)
+	    if (is_low_surrogate(c2))
 	      {
+		c = surrogate_pair_to_code_point(c, c2);
 		inc = 2;
-		c = (c << 10) + c2 - 0x35FDC00;
 	      }
 	    else
 	      return codecvt_base::error;
 	  }
+	else if (is_low_surrogate(c))
+	  return codecvt_base::error;
 	if (c > maxcode)
 	  return codecvt_base::error;
 	if (!write_utf8_code_point(to, c))
@@ -452,8 +485,8 @@ namespace
     while (count+1 < max)
       {
 	char32_t c = read_utf8_code_point(from, maxcode);
-	if (c == char32_t(-1))
-	  break;
+	if (c > maxcode)
+	  return from.next;
 	else if (c > max_single_utf16_unit)
 	  ++count;
 	++count;
@@ -489,7 +522,7 @@ namespace
     while (from.size() && to.size())
       {
 	char16_t c = from.next[0];
-	if (c >= 0xD800 && c <= 0xDBFF) // start of surrogate pair
+	if (is_high_surrogate(c))
 	  return codecvt_base::error;
 	if (c > maxcode)
 	  return codecvt_base::error;
@@ -510,9 +543,9 @@ namespace
     while (from.size() && to.size())
       {
 	const char32_t c = read_utf16_code_point(from, maxcode, mode);
-	if (c == char32_t(-1))
-	  break;
-	if (c >= maxcode)
+	if (c == incomplete_mb_character)
+	  return codecvt_base::partial;
+	if (c > maxcode)
 	  return codecvt_base::error;
 	*to.next++ = c;
       }
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
index 9271eca89ec..a21a8385629 100644
--- a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
@@ -79,8 +79,7 @@ test01()
 
     codecvt_c16::state_type state01;
     state01 = {};
-    codecvt_base::result res = cvt->out(state01, u16dat, u16dat_end,
-from_next,
+    codecvt_base::result res = cvt->out(state01, u16dat, u16dat_end, from_next,
                                         buffer, buffer_end, to_next);
 
     VERIFY(res == codecvt_base::ok);
diff --git a/libstdc++-v3/testsuite/22_locale/conversions/buffer/1.cc b/libstdc++-v3/testsuite/22_locale/conversions/buffer/1.cc
new file mode 100644
index 00000000000..f008f5a1111
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/conversions/buffer/1.cc
@@ -0,0 +1,78 @@
+// { dg-options "-std=gnu++11" }
+
+// Copyright (C) 2012 Free Software Foundation
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// 22.3.3.2.3  Buffer conversions
+
+#include <locale>
+#include <sstream>
+#include <testsuite_hooks.h>
+
+template<typename Elem>
+struct cvt : std::codecvt<Elem, char, std::mbstate_t> { };
+
+template<typename Elem>
+using buf_conv = std::wbuffer_convert<cvt<Elem>, Elem>;
+
+using std::string;
+using std::stringstream;
+using std::wstring;
+using std::wstringstream;
+
+void test01()
+{
+  buf_conv<wchar_t> buf;
+  std::stringbuf sbuf;
+  VERIFY( buf.rdbuf() == nullptr );
+  VERIFY( buf.rdbuf(&sbuf) == nullptr );
+  VERIFY( buf.rdbuf() == &sbuf );
+  VERIFY( buf.rdbuf(nullptr) == &sbuf );
+}
+
+void test02()
+{
+  std::stringbuf sbuf;
+  buf_conv<char> buf(&sbuf);  // noconv
+
+  stringstream ss;
+  ss.std::ios::rdbuf(&buf);
+  string input = "King for a day...";
+  ss << input << std::flush;
+  string output = sbuf.str();
+  VERIFY( input == output );
+}
+
+void test03()
+{
+  std::stringbuf sbuf;
+  buf_conv<wchar_t> buf(&sbuf);
+
+  wstringstream ss;
+  ss.std::wios::rdbuf(&buf);
+  wstring input = L"Fool for a lifetime";
+  ss << input << std::flush;
+  string output = sbuf.str();
+  VERIFY( output == "Fool for a lifetime" );
+}
+
+int main()
+{
+  test01();
+  test02();
+  test03();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/conversions/string/2.cc b/libstdc++-v3/testsuite/22_locale/conversions/string/2.cc
index 94eb75f9b00..07d2b520e97 100644
--- a/libstdc++-v3/testsuite/22_locale/conversions/string/2.cc
+++ b/libstdc++-v3/testsuite/22_locale/conversions/string/2.cc
@@ -30,26 +30,43 @@ template<typename Elem>
 using str_conv = std::wstring_convert<cvt<Elem>, Elem>;
 
 using std::string;
-using std::wstring;
+using std::u16string;
+using std::u32string;
 
 // test conversion errors, with and without error strings
 
 void test01()
 {
-  typedef str_conv<wchar_t> sc;
+  typedef str_conv<char16_t> sc;
 
   const sc::byte_string berr = "invalid wide string";
-  const sc::wide_string werr = L"invalid byte string";
+  const sc::wide_string werr = u"invalid byte string";
 
   sc c(berr, werr);
   string input = "Stop";
+  input += char(0xFF);
+  u16string woutput = c.from_bytes(input);
+  VERIFY( werr == woutput );
+  u16string winput = u"Stop";
+  winput += char16_t(0xDC00);
+  string output = c.to_bytes(winput);
+  VERIFY( berr == output );
+}
+
+void test02()
+{
+  typedef str_conv<char32_t> sc;
+
+  const sc::byte_string berr = "invalid wide string";
+  const sc::wide_string werr = U"invalid byte string";
+
+  sc c(berr, werr);
+  string input = "Halt";
   input += char(0xff);
-  input += char(0xff);
-  wstring woutput = c.from_bytes(input);
+  u32string woutput = c.from_bytes(input);
   VERIFY( werr == woutput );
-  wstring winput = L"Stop";
-  winput += wchar_t(0xff);
-  winput += wchar_t(0xff);
+  u32string winput = U"Halt";
+  winput += char32_t(-1);
   string output = c.to_bytes(winput);
   VERIFY( berr == output );
 }
@@ -57,4 +74,5 @@ void test01()
 int main()
 {
   test01();
+  test02();
 }
diff --git a/libstdc++-v3/testsuite/22_locale/conversions/string/3.cc b/libstdc++-v3/testsuite/22_locale/conversions/string/3.cc
new file mode 100644
index 00000000000..7c4ac207cd8
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/conversions/string/3.cc
@@ -0,0 +1,61 @@
+// { dg-options "-std=gnu++11" }
+
+// Copyright (C) 2012 Free Software Foundation
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// 22.3.3.2.2  String conversions
+
+#include <locale>
+#include <string>
+#include <testsuite_hooks.h>
+
+template<typename Elem>
+struct cvt : std::codecvt<Elem, char, std::mbstate_t> { };
+
+template<typename Elem>
+using str_conv = std::wstring_convert<cvt<Elem>, Elem>;
+
+using std::string;
+using std::u32string;
+
+// test construction with state, for partial conversions
+
+void test01()
+{
+  typedef str_conv<char32_t> wsc;
+
+  wsc c;
+  string input = u8"\u00a3 shillings pence";
+  u32string woutput = c.from_bytes(input.substr(0, 1));
+  auto partial_state = c.state();
+  auto partial_count = c.converted();
+
+  auto woutput2 = c.from_bytes("state reset on next conversion");
+  VERIFY( woutput2 == U"state reset on next conversion" );
+
+  wsc c2(new cvt<char32_t>, partial_state);
+  woutput += c2.from_bytes(input.substr(partial_count));
+  VERIFY( U"\u00a3 shillings pence" == woutput );
+
+  string roundtrip = c2.to_bytes(woutput);
+  VERIFY( input == roundtrip );
+}
+
+int main()
+{
+  test01();
+}
-- 
2.30.2