2017-03-16 Jonathan Wakely <jwakely@redhat.com>
+ * src/c++11/codecvt.cc (codecvt<char16_t, char, mbstate_t>)
+ (codecvt<char32_t, char, mbstate_t>, __codecvt_utf8_base<char16_t>)
+ (__codecvt_utf8_base<char32_t>, __codecvt_utf8_base<wchar_t>)
+ (__codecvt_utf16_base<char16_t>, __codecvt_utf16_base<char32_t>)
+ (__codecvt_utf16_base<wchar_t>, __codecvt_utf8_utf16_base<char16_t>)
+ (__codecvt_utf8_utf16_base<char32_t>)
+ (__codecvt_utf8_utf16_base<wchar_t>): Fix do_encoding() and
+ do_max_length() return values.
+ * testsuite/22_locale/codecvt/codecvt_utf16/members.cc: New test.
+ * testsuite/22_locale/codecvt/codecvt_utf8/members.cc: New test.
+ * testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc: New test.
+
PR libstdc++/79980
* include/bits/locale_conv.h (__do_str_codecvt): Set __count on
error path.
// Multibyte sequences can have a "header" consisting of a Byte Order Mark
const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
- const unsigned char utf16_bom[4] = { 0xFE, 0xFF };
- const unsigned char utf16le_bom[4] = { 0xFF, 0xFE };
+ const unsigned char utf16_bom[2] = { 0xFE, 0xFF };
+ const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };
template<size_t N>
inline bool
int
codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
bool
codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
int
codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
{
- // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
- // whereas 4 byte sequences require two 16-bit code units.
- return 3;
+ // A single character (one or two UTF-16 code units) requires
+ // up to four UTF-8 code units.
+ return 4;
}
// Define members of codecvt<char32_t, char, mbstate_t> specialization.
int
codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
bool
codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
int
codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
-{ return 4; }
+{
+ // A single character (one UTF-32 code unit) requires
+ // up to 4 UTF-8 code units.
+ return 4;
+}
// Define members of codecvt_utf8<char16_t> base class implementation.
// Converts from UTF-8 to UCS-2.
int
__codecvt_utf8_base<char16_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
bool
__codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
int
__codecvt_utf8_base<char16_t>::do_max_length() const throw()
-{ return 3; }
+{
+ // A single UCS-2 character requires up to three UTF-8 code units.
+ // (UCS-2 cannot represent characters that use four UTF-8 code units).
+ int max = 3;
+ if (_M_mode & consume_header)
+ max += sizeof(utf8_bom);
+ return max;
+}
// Define members of codecvt_utf8<char32_t> base class implementation.
// Converts from UTF-8 to UTF-32 (aka UCS-4).
int
__codecvt_utf8_base<char32_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
bool
__codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
int
__codecvt_utf8_base<char32_t>::do_max_length() const throw()
-{ return 4; }
+{
+ // A single UCS-4 character requires up to four UTF-8 code units.
+ int max = 4;
+ if (_M_mode & consume_header)
+ max += sizeof(utf8_bom);
+ return max;
+}
#ifdef _GLIBCXX_USE_WCHAR_T
// Define members of codecvt_utf8<wchar_t> base class implementation.
int
__codecvt_utf8_base<wchar_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
bool
__codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
int
__codecvt_utf8_base<wchar_t>::do_max_length() const throw()
-{ return 4; }
+{
+#if __SIZEOF_WCHAR_T__ == 2
+ int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
+#else
+ int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
+#endif
+ if (_M_mode & consume_header)
+ max += sizeof(utf8_bom);
+ return max;
+}
#endif
// Define members of codecvt_utf16<char16_t> base class implementation.
int
__codecvt_utf16_base<char16_t>::do_encoding() const throw()
-{ return 1; }
+{ return 0; } // UTF-16 is not a fixed-width encoding
bool
__codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
int
__codecvt_utf16_base<char16_t>::do_max_length() const throw()
-{ return 3; }
+{
+ // A single UCS-2 character requires one UTF-16 code unit (so two chars).
+ // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
+ int max = 2;
+ if (_M_mode & consume_header)
+ max += sizeof(utf16_bom);
+ return max;
+}
// Define members of codecvt_utf16<char32_t> base class implementation.
// Converts from UTF-16 to UTF-32 (aka UCS-4).
int
__codecvt_utf16_base<char32_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-16 is not a fixed-width encoding
bool
__codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
int
__codecvt_utf16_base<char32_t>::do_max_length() const throw()
-{ return 4; }
+{
+ // A single UCS-4 character requires one or two UTF-16 code units
+ // (so up to four chars).
+ int max = 4;
+ if (_M_mode & consume_header)
+ max += sizeof(utf16_bom);
+ return max;
+}
#ifdef _GLIBCXX_USE_WCHAR_T
// Define members of codecvt_utf16<wchar_t> base class implementation.
int
__codecvt_utf16_base<wchar_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-16 is not a fixed-width encoding
bool
__codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
int
__codecvt_utf16_base<wchar_t>::do_max_length() const throw()
-{ return 4; }
+{
+#if __SIZEOF_WCHAR_T__ == 2
+ int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
+#else
+ int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
+#endif
+ if (_M_mode & consume_header)
+ max += sizeof(utf16_bom);
+ return max;
+}
#endif
// Define members of codecvt_utf8_utf16<char16_t> base class implementation.
int
__codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
bool
__codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
int
__codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
{
- // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
- // whereas 4 byte sequences require two 16-bit code units.
- return 3;
+ // A single character can be 1 or 2 UTF-16 code units,
+ // requiring up to 4 UTF-8 code units.
+ int max = 4;
+ if (_M_mode & consume_header)
+ max += sizeof(utf8_bom);
+ return max;
}
// Define members of codecvt_utf8_utf16<char32_t> base class implementation.
int
__codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
bool
__codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
int
__codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
{
- // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
- // whereas 4 byte sequences require two 16-bit code units.
- return 3;
+ // A single character can be 1 or 2 UTF-16 code units,
+ // requiring up to 4 UTF-8 code units.
+ int max = 4;
+ if (_M_mode & consume_header)
+ max += sizeof(utf8_bom);
+ return max;
}
#ifdef _GLIBCXX_USE_WCHAR_T
int
__codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
bool
__codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
int
__codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
{
- // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
- // whereas 4 byte sequences require two 16-bit code units.
- return 3;
+ // A single character can be 1 or 2 UTF-16 code units,
+ // requiring up to 4 UTF-8 code units.
+ int max = 4;
+ if (_M_mode & consume_header)
+ max += sizeof(utf8_bom);
+ return max;
}
#endif
const codecvt_c16* const cvt = &use_facet<codecvt_c16>(loc_c);
VERIFY(!cvt->always_noconv());
- VERIFY(cvt->max_length() == 3);
+ VERIFY(cvt->max_length() == 4);
VERIFY(cvt->encoding() == 0);
const char u8dat[] = u8"H\U000000E4ll\U000000F6 \U0001F63F \U000056FD "
--- /dev/null
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+const int bomlen = 2; // UTF-16 BOM is 16 bits
+
+void
+test01() // codecvt_utf16<char16_t>: checks always_noconv(), encoding() and max_length()
+{
+ const int maxlen = 2; // expected limit without a BOM: one UTF-16 code unit, i.e. two chars
+
+ std::codecvt_utf16<char16_t> c; // no mode flags given explicitly
+ VERIFY( c.always_noconv() == false );
+ VERIFY( c.encoding() == 0 ); // expects 0: not a fixed-width encoding
+ VERIFY( c.max_length() == maxlen );
+
+ std::codecvt_utf16<char16_t, 0x10ffff, std::consume_header> c_bom; // same facet with consume_header set
+ VERIFY( c_bom.always_noconv() == false );
+ VERIFY( c_bom.encoding() == 0 );
+ VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // with consume_header the expected limit grows by the BOM length
+}
+
+void
+test02() // codecvt_utf16<char32_t>: checks always_noconv(), encoding() and max_length()
+{
+ const int maxlen = 4; // expected limit without a BOM: up to two UTF-16 code units, i.e. four chars
+
+ std::codecvt_utf16<char32_t> c; // no mode flags given explicitly
+ VERIFY( c.always_noconv() == false );
+ VERIFY( c.encoding() == 0 ); // expects 0: not a fixed-width encoding
+ VERIFY( c.max_length() == maxlen );
+
+ std::codecvt_utf16<char32_t, 0x10ffff, std::consume_header> c_bom; // same facet with consume_header set
+ VERIFY( c_bom.always_noconv() == false );
+ VERIFY( c_bom.encoding() == 0 );
+ VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // with consume_header the expected limit grows by the BOM length
+}
+
+void
+test03() // codecvt_utf16<wchar_t>: same checks; the body is empty unless _GLIBCXX_USE_WCHAR_T is defined
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+ const int maxlen = sizeof(wchar_t) == 4 ? 4 : 2; // 4 when wchar_t is four bytes wide, otherwise 2
+
+ std::codecvt_utf16<wchar_t> c; // no mode flags given explicitly
+ VERIFY( c.always_noconv() == false );
+ VERIFY( c.encoding() == 0 ); // expects 0: not a fixed-width encoding
+ VERIFY( c.max_length() == maxlen );
+
+ std::codecvt_utf16<wchar_t, 0x10ffff, std::consume_header> c_bom; // same facet with consume_header set
+ VERIFY( c_bom.always_noconv() == false );
+ VERIFY( c_bom.encoding() == 0 );
+ VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // with consume_header the expected limit grows by the BOM length
+#endif // _GLIBCXX_USE_WCHAR_T
+}
+
+int
+main() // runs the three codecvt_utf16 member checks in order
+{
+ test01(); // char16_t
+ test02(); // char32_t
+ test03(); // wchar_t
+}
--- /dev/null
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+const int bomlen = 3; // UTF-8 BOM is 24 bits
+
+void
+test01() // codecvt_utf8<char16_t>: checks always_noconv(), encoding() and max_length()
+{
+ const int maxlen = 3; // expected limit without a BOM: a UCS-2 character needs at most three UTF-8 code units
+
+ std::codecvt_utf8<char16_t> c; // no mode flags given explicitly
+ VERIFY( c.always_noconv() == false );
+ VERIFY( c.encoding() == 0 ); // expects 0: not a fixed-width encoding
+ VERIFY( c.max_length() == maxlen );
+
+ std::codecvt_utf8<char16_t, 0x10ffff, std::consume_header> c_bom; // same facet with consume_header set
+ VERIFY( c_bom.always_noconv() == false );
+ VERIFY( c_bom.encoding() == 0 );
+ VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // with consume_header the expected limit grows by the BOM length
+}
+
+void
+test02() // codecvt_utf8<char32_t>: checks always_noconv(), encoding() and max_length()
+{
+ const int maxlen = 4; // expected limit without a BOM: a UCS-4 character needs at most four UTF-8 code units
+
+ std::codecvt_utf8<char32_t> c; // no mode flags given explicitly
+ VERIFY( c.always_noconv() == false );
+ VERIFY( c.encoding() == 0 ); // expects 0: not a fixed-width encoding
+ VERIFY( c.max_length() == maxlen );
+
+ std::codecvt_utf8<char32_t, 0x10ffff, std::consume_header> c_bom; // same facet with consume_header set
+ VERIFY( c_bom.always_noconv() == false );
+ VERIFY( c_bom.encoding() == 0 );
+ VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // with consume_header the expected limit grows by the BOM length
+}
+
+void
+test03() // codecvt_utf8<wchar_t>: same checks; the body is empty unless _GLIBCXX_USE_WCHAR_T is defined
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+ const int maxlen = sizeof(wchar_t) == 4 ? 4 : 3; // 4 when wchar_t is four bytes wide, otherwise 3
+
+ std::codecvt_utf8<wchar_t> c; // no mode flags given explicitly
+ VERIFY( c.always_noconv() == false );
+ VERIFY( c.encoding() == 0 ); // expects 0: not a fixed-width encoding
+ VERIFY( c.max_length() == maxlen );
+
+ std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header> c_bom; // same facet with consume_header set
+ VERIFY( c_bom.always_noconv() == false );
+ VERIFY( c_bom.encoding() == 0 );
+ VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // with consume_header the expected limit grows by the BOM length
+#endif // _GLIBCXX_USE_WCHAR_T
+}
+
+int
+main() // runs the three codecvt_utf8 member checks in order
+{
+ test01(); // char16_t
+ test02(); // char32_t
+ test03(); // wchar_t
+}
--- /dev/null
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+const int bomlen = 3; // UTF-8 BOM is 24 bits
+const int maxlen = 4; // expected max_length() without a BOM, shared by all three tests in this file
+
+void
+test01() // codecvt_utf8_utf16<char16_t>: checks always_noconv(), encoding() and max_length()
+{
+ std::codecvt_utf8_utf16<char16_t> c; // no mode flags given explicitly
+ VERIFY( c.always_noconv() == false );
+ VERIFY( c.encoding() == 0 ); // expects 0: not a fixed-width encoding
+ VERIFY( c.max_length() == maxlen );
+
+ std::codecvt_utf8_utf16<char16_t, 0x10ffff, std::consume_header> c_bom; // same facet with consume_header set
+ VERIFY( c_bom.always_noconv() == false );
+ VERIFY( c_bom.encoding() == 0 );
+ VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // with consume_header the expected limit grows by the BOM length
+}
+
+void
+test02() // codecvt_utf8_utf16<char32_t>: checks always_noconv(), encoding() and max_length()
+{
+ std::codecvt_utf8_utf16<char32_t> c; // no mode flags given explicitly
+ VERIFY( c.always_noconv() == false );
+ VERIFY( c.encoding() == 0 ); // expects 0: not a fixed-width encoding
+ VERIFY( c.max_length() == maxlen );
+
+ std::codecvt_utf8_utf16<char32_t, 0x10ffff, std::consume_header> c_bom; // same facet with consume_header set
+ VERIFY( c_bom.always_noconv() == false );
+ VERIFY( c_bom.encoding() == 0 );
+ VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // with consume_header the expected limit grows by the BOM length
+}
+
+void
+test03() // codecvt_utf8_utf16<wchar_t>: same checks; the body is empty unless _GLIBCXX_USE_WCHAR_T is defined
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+ std::codecvt_utf8_utf16<wchar_t> c; // no mode flags given explicitly
+ VERIFY( c.always_noconv() == false );
+ VERIFY( c.encoding() == 0 ); // expects 0: not a fixed-width encoding
+ VERIFY( c.max_length() == maxlen );
+
+ std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::consume_header> c_bom; // same facet with consume_header set
+ VERIFY( c_bom.always_noconv() == false );
+ VERIFY( c_bom.encoding() == 0 );
+ VERIFY( c_bom.max_length() == (maxlen + bomlen) ); // with consume_header the expected limit grows by the BOM length
+#endif // _GLIBCXX_USE_WCHAR_T
+}
+
+int
+main() // runs the three codecvt_utf8_utf16 member checks in order
+{
+ test01(); // char16_t
+ test02(); // char32_t
+ test03(); // wchar_t
+}