#include <codecvt>
#include <cstring> // std::memcpy, std::memcmp
-#include <bits/stl_algobase.h> // std::max
+#include <bits/stl_algobase.h> // std::min
#ifdef _GLIBCXX_USE_C99_STDINT_TR1
namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
+ // The standard doesn't define these operators, which is annoying.
+  // Convert a codecvt_mode to its underlying integer type so that the
+  // bitwise operators defined below can combine and mask mode flags.
+  // The cast target is the declared return type: the previous mode_t is
+  // an unrelated POSIX type that need not be declared (or wide enough) here.
+  static underlying_type<codecvt_mode>::type
+  to_integer(codecvt_mode m)
+  { return static_cast<underlying_type<codecvt_mode>::type>(m); }
+
+  // Keep in m only the flags that are also set in n; returns m.
+  static codecvt_mode&
+  operator&=(codecvt_mode& m, codecvt_mode n)
+  {
+    m = static_cast<codecvt_mode>(to_integer(m) & to_integer(n));
+    return m;
+  }
+
+  // Add to m every flag that is set in n; returns m.
+  static codecvt_mode&
+  operator|=(codecvt_mode& m, codecvt_mode n)
+  {
+    m = static_cast<codecvt_mode>(to_integer(m) | to_integer(n));
+    return m;
+  }
+
+  // Bitwise complement of m, for use as a mask with operator&=.
+  static codecvt_mode
+  operator~(codecvt_mode m)
+  {
+    return static_cast<codecvt_mode>(~to_integer(m));
+  }
+
namespace
{
// Largest code point that fits in a single UTF-16 code unit.
read_bom(from, utf8_bom);
}
- // If consume_header is set in mode update from.next to after any BOM.
- // Return little_endian iff the UTF-16LE BOM was present.
- codecvt_mode
- read_utf16_bom(range<const char16_t>& from, codecvt_mode mode)
+ // If consume_header is not set in mode, or from is empty, no effects.
+ // Otherwise, if *from.next is a UTF-16 BOM, increment from.next and then:
+ // - if the UTF-16BE BOM was found, unset little_endian in mode, or
+ // - if the UTF-16LE BOM was found, set little_endian in mode.
+ void
+ read_utf16_bom(range<const char16_t>& from, codecvt_mode& mode) // mode is in/out: the BOM's byte order is written back
{
if (mode & consume_header && from.size())
{
- if (*from.next == 0xFEFF)
- ++from.next;
- else if (*from.next == 0xFFFE)
+ if (!memcmp(from.next, utf16_bom, 2)) // bytewise compare of the first 2 bytes at from.next
+ {
+ ++from.next;
+ mode &= ~little_endian; // a BE BOM overrides any little_endian flag passed in
+ }
+ else if (!memcmp(from.next, utf16le_bom, 2)) // same bytewise test against the LE BOM
{
++from.next;
- return little_endian;
+ mode |= little_endian; // record the detected byte order in the caller's mode
}
}
- return {};
}
// Read a codepoint from a UTF-8 multibyte sequence.
ucs4_in(range<const char16_t>& from, range<char32_t>& to,
unsigned long maxcode = max_code_point, codecvt_mode mode = {})
{
- if (read_utf16_bom(from, mode) == little_endian)
- mode = codecvt_mode(mode & little_endian);
+ read_utf16_bom(from, mode);
while (from.size() && to.size())
{
const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
return codecvt_base::ok;
}
- // utf8 -> utf16
+ // Flag indicating whether surrogate pairs are allowed (UTF-16) or not (UCS-2).
+ enum class surrogates { allowed, disallowed };
+
+ // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
template<typename C>
codecvt_base::result
utf16_in(range<const char>& from, range<C>& to,
- unsigned long maxcode = max_code_point, codecvt_mode mode = {})
+ unsigned long maxcode = max_code_point, codecvt_mode mode = {},
+ surrogates s = surrogates::allowed)
{
read_utf8_bom(from, mode);
while (from.size() && to.size())
const char* const first = from.next;
const char32_t codepoint = read_utf8_code_point(from, maxcode);
if (codepoint == incomplete_mb_character)
- return codecvt_base::partial;
+ {
+ if (s == surrogates::allowed)
+ return codecvt_base::partial;
+ else
+ return codecvt_base::error; // No surrogates in UCS2
+ }
if (codepoint > maxcode)
return codecvt_base::error;
if (!write_utf16_code_point(to, codepoint, mode))
return codecvt_base::ok;
}
- // utf16 -> utf8
+ // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
template<typename C>
codecvt_base::result
utf16_out(range<const C>& from, range<char>& to,
- unsigned long maxcode = max_code_point, codecvt_mode mode = {})
+ unsigned long maxcode = max_code_point, codecvt_mode mode = {},
+ surrogates s = surrogates::allowed)
{
if (!write_utf8_bom(to, mode))
return codecvt_base::partial;
int inc = 1;
if (is_high_surrogate(c))
{
+ if (s == surrogates::disallowed)
+ return codecvt_base::error; // No surrogates in UCS-2
+
if (from.size() < 2)
return codecvt_base::ok; // stop converting at this point
++count;
}
if (count+1 == max) // take one more character if it fits in a single unit
- read_utf8_code_point(from, std::max(max_single_utf16_unit, maxcode));
+ read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
return from.next;
}
ucs2_in(range<const char>& from, range<char16_t>& to,
char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
- return utf16_in(from, to, std::max(max_single_utf16_unit, maxcode), mode);
+ // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+ maxcode = std::min(max_single_utf16_unit, maxcode); // min keeps a smaller caller limit; the removed max raised it
+ return utf16_in(from, to, maxcode, mode, surrogates::disallowed); // reuse the UTF-16 routine in its UCS-2 mode
}
// ucs2 -> utf8
ucs2_out(range<const char16_t>& from, range<char>& to,
char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
- return utf16_out(from, to, std::max(max_single_utf16_unit, maxcode), mode);
+ // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+ maxcode = std::min(max_single_utf16_unit, maxcode); // min keeps a smaller caller limit; the removed max raised it
+ return utf16_out(from, to, maxcode, mode, surrogates::disallowed); // reuse the UTF-16 routine in its UCS-2 mode
}
// ucs2 -> utf16
ucs2_in(range<const char16_t>& from, range<char16_t>& to,
char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
- if (read_utf16_bom(from, mode) == little_endian)
- mode = codecvt_mode(mode & little_endian);
- maxcode = std::max(max_single_utf16_unit, maxcode);
+ read_utf16_bom(from, mode);
+ // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+ maxcode = std::min(max_single_utf16_unit, maxcode);
while (from.size() && to.size())
{
const char32_t c = read_utf16_code_point(from, maxcode, mode);
if (c == incomplete_mb_character)
- return codecvt_base::partial;
+ return codecvt_base::error; // UCS-2 only supports single units.
if (c > maxcode)
return codecvt_base::error;
*to.next++ = c;
char32_t maxcode, codecvt_mode mode)
{
range<const char16_t> from{ begin, end };
- if (read_utf16_bom(from, mode) == little_endian)
- mode = codecvt_mode(mode & little_endian);
- maxcode = std::max(max_single_utf16_unit, maxcode);
+ read_utf16_bom(from, mode);
+ // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+ maxcode = std::min(max_single_utf16_unit, maxcode);
char32_t c = 0;
while (max-- && c <= maxcode)
c = read_utf16_code_point(from, maxcode, mode);
{
range<const char> from{ begin, end };
read_utf8_bom(from, mode);
- maxcode = std::max(max_single_utf16_unit, maxcode);
+ // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+ maxcode = std::min(max_single_utf16_unit, maxcode);
char32_t c = 0;
while (max-- && c <= maxcode)
c = read_utf8_code_point(from, maxcode);
char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
range<const char16_t> from{ begin, end };
- if (read_utf16_bom(from, mode) == little_endian)
- mode = codecvt_mode(mode & little_endian);
+ read_utf16_bom(from, mode);
char32_t c = 0;
while (max-- && c <= maxcode)
c = read_utf16_code_point(from, maxcode, mode);
--- /dev/null
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <locale>
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+// PR libstdc++/79980
+
+// Return m with std::consume_header added.  Kept to a single return
+// statement so that it remains a valid C++11 constexpr function.
+constexpr std::codecvt_mode
+mode(std::codecvt_mode m)
+{
+  return static_cast<std::codecvt_mode>(std::consume_header | m);
+}
+
+template<typename WCh, unsigned long Max = 0x10FFFF,
+ std::codecvt_mode Mode = std::consume_header>
+ using Conv // wstring_convert over codecvt_utf16; mode(Mode) always adds consume_header
+ = std::wstring_convert<std::codecvt_utf16<WCh, Max, mode(Mode)>, WCh>;
+
+// Bytes FE FF (big-endian BOM) with the default mode: the code unit that
+// follows must be decoded big-endian.
+void
+test01()
+{
+  const char bytes[] = "\xFE\xFF\xAB\xCD";
+  Conv<char16_t> cvt;
+  const auto decoded = cvt.from_bytes(bytes, bytes + 4);
+  VERIFY( decoded[0] == 0xabcd );
+}
+
+// Bytes FF FE (little-endian BOM) with the default mode: the BOM selects
+// little-endian decoding although std::little_endian was not requested.
+void
+test02()
+{
+  const char bytes[] = "\xFF\xFE\xAB\xCD";
+  Conv<char16_t> cvt;
+  const auto decoded = cvt.from_bytes(bytes, bytes + 4);
+  VERIFY( decoded[0] == 0xcdab );
+}
+
+// Big-endian BOM with std::little_endian in the mode: the BOM takes
+// precedence, so the code unit is still decoded big-endian.
+void
+test03()
+{
+  const char bytes[] = "\xFE\xFF\xAB\xCD";
+  Conv<char16_t, 0x10FFFF, std::little_endian> cvt;
+  const auto decoded = cvt.from_bytes(bytes, bytes + 4);
+  VERIFY( decoded[0] == 0xabcd );
+}
+
+// Little-endian BOM with std::little_endian in the mode: both agree, so
+// the code unit is decoded little-endian.
+void
+test04()
+{
+  const char bytes[] = "\xFF\xFE\xAB\xCD";
+  Conv<char16_t, 0x10FFFF, std::little_endian> cvt;
+  const auto decoded = cvt.from_bytes(bytes, bytes + 4);
+  VERIFY( decoded[0] == 0xcdab );
+}
+
+// With Maxcode == 0xFF the second code unit (0xABCD) is out of range:
+// from_bytes must return the error string with only 2 bytes converted.
+void
+test05()
+{
+  const char bytes[] = "\0\x61\xAB\xCD"; // character greater than 0x00FF
+  Conv<char16_t, 0xFF> cvt("to_bytes failed", u"from_bytes failed");
+  const std::u16string decoded = cvt.from_bytes(bytes, bytes + 4);
+  VERIFY( decoded == u"from_bytes failed" );
+  VERIFY( cvt.converted() == 2 );
+}
+
+// Input that stops in the middle of the second code unit: from_bytes must
+// return the error string with only the first 2 bytes converted.
+void
+test06()
+{
+  const char bytes[] = "\0\x61\xAB\xCD";
+  Conv<char16_t> cvt("to_bytes failed", u"from_bytes failed");
+  // Only three of the four bytes are supplied: incomplete character.
+  const std::u16string decoded = cvt.from_bytes(bytes, bytes + 3);
+  VERIFY( decoded == u"from_bytes failed" );
+  VERIFY( cvt.converted() == 2 );
+}
+
+// to_bytes must reject surrogates, whether they form a complete pair or
+// the input ends after the high surrogate.
+void
+test07()
+{
+  Conv<char16_t> cvt("to_bytes failed", u"from_bytes failed");
+
+  // ucs2 to utf-16 conversion should fail on invalid ucs2 input:
+  const std::u16string input = u"1234\U00001111\U0001ffff";
+  auto bytes = cvt.to_bytes(input);
+  VERIFY( bytes == "to_bytes failed" );
+  VERIFY( cvt.converted() == 5 );
+
+  // And should also fail on incomplete surrogate pair (not return partial):
+  const std::u16string truncated = input.substr(0, input.size()-1);
+  bytes = cvt.to_bytes(truncated);
+  VERIFY( bytes == "to_bytes failed" );
+  VERIFY( cvt.converted() == 5 );
+}
+
+int main()
+{
+  // Run every test case once, in declaration order.
+  void (*const cases[])() = {
+    test01, test02, test03, test04, test05, test06, test07
+  };
+  for (auto run : cases)
+    run();
+}
--- /dev/null
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <locale>
+#include <string>
+#include <testsuite_hooks.h>
+
+using std::wstring_convert;
+using std::codecvt_utf8;
+
+// codecvt_utf8<char16_t> with the default Maxcode: characters outside the
+// BMP must be rejected in both directions.
+void
+test01()
+{
+  // The cast keeps this well-formed when u8 literals have type
+  // const char8_t[] (C++20 and later); the bytes are unchanged.
+  std::string src
+    = reinterpret_cast<const char*>(u8"1234\U00001111\U0001ffff");
+  wstring_convert<codecvt_utf8<char16_t>, char16_t> c("bad", u"BAD");
+
+  // utf-8 to ucs2 conversion should fail on character outside BMP
+  auto ucs2 = c.from_bytes(src);
+  VERIFY( ucs2 == u"BAD" );
+  VERIFY( c.converted() == 7 );
+
+  // ucs2 to utf-8 conversion should fail on invalid ucs2 input:
+  std::u16string utf16 = u"1234\U00001111\U0001ffff";
+  auto out = c.to_bytes(utf16);
+  VERIFY( out == "bad" );
+  VERIFY( c.converted() == 5 );
+
+  // And should also fail on incomplete surrogate pair (not return partial):
+  out = c.to_bytes(utf16.substr(0, utf16.size()-1));
+  VERIFY( out == "bad" );
+  VERIFY( c.converted() == 5 );
+}
+
+// codecvt_utf8<char16_t, 0x1000>: U+1111 exceeds Maxcode, so from_bytes
+// must fail after the four ASCII characters.
+void
+test02()
+{
+  // The cast keeps this well-formed when u8 literals have type
+  // const char8_t[] (C++20 and later); the bytes are unchanged.
+  std::string src
+    = reinterpret_cast<const char*>(u8"1234\U00001111\U0001ffff");
+  wstring_convert<codecvt_utf8<char16_t, 0x1000>, char16_t> c("bad", u"BAD");
+
+  // utf-8 to ucs2 conversion should fail on character above Maxcode=0x1000
+  auto ucs2 = c.from_bytes(src);
+  VERIFY( ucs2 == u"BAD" );
+  VERIFY( c.converted() == 4 );
+}
+
+// codecvt_utf8<char32_t, 0x10000>: U+1FFFF exceeds Maxcode, so from_bytes
+// must fail once the first seven bytes have been converted.
+void
+test03()
+{
+  // The cast keeps this well-formed when u8 literals have type
+  // const char8_t[] (C++20 and later); the bytes are unchanged.
+  std::string src
+    = reinterpret_cast<const char*>(u8"1234\U00001111\U0001ffff");
+  wstring_convert<codecvt_utf8<char32_t, 0x10000>, char32_t> c("bad", U"BAD");
+
+  // utf-8 to ucs4 conversion should fail on character above Maxcode=0x10000
+  auto ucs4 = c.from_bytes(src);
+  VERIFY( ucs4 == U"BAD" );
+  VERIFY( c.converted() == 7 );
+}
+
+// codecvt_utf8<char32_t, 0x1000>: U+1111 exceeds Maxcode, so from_bytes
+// must fail after the four ASCII characters.
+void
+test04()
+{
+  // The cast keeps this well-formed when u8 literals have type
+  // const char8_t[] (C++20 and later); the bytes are unchanged.
+  std::string src
+    = reinterpret_cast<const char*>(u8"1234\U00001111\U0001ffff");
+  wstring_convert<codecvt_utf8<char32_t, 0x1000>, char32_t> c("bad", U"BAD");
+
+  // utf-8 to ucs4 conversion should fail on character above Maxcode=0x1000
+  auto ucs4 = c.from_bytes(src);
+  VERIFY( ucs4 == U"BAD" );
+  VERIFY( c.converted() == 4 );
+}
+
+int
+main()
+{
+  // Run every test case once, in declaration order.
+  void (*const cases[])() = { test01, test02, test03, test04 };
+  for (auto run : cases)
+    run();
+}