1 // Locale support (codecvt) -*- C++ -*-
3 // Copyright (C) 2015-2016 Free Software Foundation, Inc.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
26 #include <cstring> // std::memcpy, std::memcmp
27 #include <bits/stl_algobase.h> // std::max
29 #ifdef _GLIBCXX_USE_C99_STDINT_TR1
30 namespace std
_GLIBCXX_VISIBILITY(default)
32 _GLIBCXX_BEGIN_NAMESPACE_VERSION
36 // Largest code point that fits in a single UTF-16 code unit.
37 const char32_t max_single_utf16_unit
= 0xFFFF;
// Default upper bound (maxcode) used by the conversion helpers in this file.
39 const char32_t max_code_point
= 0x10FFFF;
41 // The functions below rely on maxcode < incomplete_mb_character
42 // (which is enforced by the codecvt_utf* classes on construction).
// Sentinel results of the read_*_code_point helpers.  They are the two
// largest char32_t values, so every permitted maxcode compares below them.
43 const char32_t incomplete_mb_character
= char32_t(-2);
44 const char32_t invalid_mb_sequence
= char32_t(-1);
// Lightweight cursor over a sequence of Elem, described by a current
// position `next` and a limit `end`.
// NOTE(review): the `struct range` header and the declarations of the
// `next` / `end` members are not visible in this listing.
46 template<typename Elem
>
// Read the element at the current position (no bounds check here).
52 Elem
operator*() const { return *next
; }
// Advance the current position by one element.
54 range
& operator++() { ++next
; return *this; }
// Number of elements left between the current position and the limit.
56 size_t size() const { return end
- next
; }
59 // Multibyte sequences can have "header" consisting of Byte Order Mark
// utf8_bom is the 3-byte UTF-8 encoding of the BOM.
60 const unsigned char utf8_bom
[3] = { 0xEF, 0xBB, 0xBF };
// The two UTF-16 tables are declared with four elements but only the first
// two are initialised explicitly (the rest are zero); write_utf16_bom
// copies just those two bytes.
61 const unsigned char utf16_bom
[4] = { 0xFE, 0xFF };
62 const unsigned char utf16le_bom
[4] = { 0xFF, 0xFE };
// Copy the N-byte marker `bom` to the current output position of `to`.
// NOTE(review): the template header, return type, capacity check, cursor
// advance and return statements are not visible in this listing; callers
// below treat the result as a success flag.
66 write_bom(range
<char>& to
, const unsigned char (&bom
)[N
])
70 memcpy(to
.next
, bom
, N
);
75 // If generate_header is set in mode write out UTF-8 BOM.
// The result of write_bom is forwarded when a BOM is requested.
// NOTE(review): the return for the "no header requested" path is not
// visible in this listing.
77 write_utf8_bom(range
<char>& to
, codecvt_mode mode
)
79 if (mode
& generate_header
)
80 return write_bom(to
, utf8_bom
);
84 // If generate_header is set in mode write out the UTF-16 BOM indicated
85 // by whether little_endian is set in mode.
// The BOM is stored as two raw bytes into one char16_t output slot.
// NOTE(review): the space check, cursor advance and return statements are
// not visible in this listing.
87 write_utf16_bom(range
<char16_t
>& to
, codecvt_mode mode
)
89 if (mode
& generate_header
)
// Choose the byte sequence matching the requested external byte order.
93 auto* bom
= (mode
& little_endian
) ? utf16le_bom
: utf16_bom
;
// Only the first two bytes of the four-element table are meaningful.
94 std::memcpy(to
.next
, bom
, 2);
// Test whether the input starts with the N-byte marker `bom`: there must
// be at least N bytes left and they must compare equal.
// NOTE(review): the template header, return type and the statements run on
// a match / mismatch are not visible in this listing.
102 read_bom(range
<const char>& from
, const unsigned char (&bom
)[N
])
104 if (from
.size() >= N
&& !memcmp(from
.next
, bom
, N
))
112 // If consume_header is set in mode update from.next to after any BOM.
// Without consume_header the input is left untouched.  The result of
// read_bom is not used here.
114 read_utf8_bom(range
<const char>& from
, codecvt_mode mode
)
116 if (mode
& consume_header
)
117 read_bom(from
, utf8_bom
);
120 // If consume_header is set in mode update from.next to after any BOM.
121 // Return little_endian iff the UTF-16LE BOM was present.
// The first code unit is compared as a host-order char16_t value against
// 0xFEFF and 0xFFFE.
// NOTE(review): the cursor advances and the default return are not visible
// in this listing.
123 read_utf16_bom(range
<const char16_t
>& from
, codecvt_mode mode
)
// Only look for a BOM when asked to and when there is input at all.
125 if (mode
& consume_header
&& from
.size())
127 if (*from
.next
== 0xFEFF)
129 else if (*from
.next
== 0xFFFE)
132 return little_endian
;
138 // Read a codepoint from a UTF-8 multibyte sequence.
139 // Updates from.next if the codepoint is not greater than maxcode.
140 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
// The lead byte c1 selects the sequence length; each continuation byte
// must match the bit pattern 10xxxxxx ((b & 0xC0) == 0x80).
// NOTE(review): the return type, the availability checks that guard each
// incomplete_mb_character return, the single-byte branch, and the
// "advance and return c" tail of each branch are not visible in this
// listing.
142 read_utf8_code_point(range
<const char>& from
, unsigned long maxcode
)
144 const size_t avail
= from
.size();
146 return incomplete_mb_character
;
147 unsigned char c1
= from
.next
[0];
148 // https://en.wikipedia.org/wiki/UTF-8#Sample_code
154 else if (c1
< 0xC2) // continuation or overlong 2-byte sequence
155 return invalid_mb_sequence
;
156 else if (c1
< 0xE0) // 2-byte sequence
159 return incomplete_mb_character
;
160 unsigned char c2
= from
.next
[1];
161 if ((c2
& 0xC0) != 0x80)
162 return invalid_mb_sequence
;
// 0x3080 == (0xC0 << 6) + 0x80: removes the tag bits of both bytes.
163 char32_t c
= (c1
<< 6) + c2
- 0x3080;
168 else if (c1
< 0xF0) // 3-byte sequence
171 return incomplete_mb_character
;
172 unsigned char c2
= from
.next
[1];
173 if ((c2
& 0xC0) != 0x80)
174 return invalid_mb_sequence
;
175 if (c1
== 0xE0 && c2
< 0xA0) // overlong
176 return invalid_mb_sequence
;
177 unsigned char c3
= from
.next
[2];
178 if ((c3
& 0xC0) != 0x80)
179 return invalid_mb_sequence
;
// 0xE2080 == (0xE0 << 12) + (0x80 << 6) + 0x80: removes the tag bits.
180 char32_t c
= (c1
<< 12) + (c2
<< 6) + c3
- 0xE2080;
185 else if (c1
< 0xF5) // 4-byte sequence
188 return incomplete_mb_character
;
189 unsigned char c2
= from
.next
[1];
190 if ((c2
& 0xC0) != 0x80)
191 return invalid_mb_sequence
;
192 if (c1
== 0xF0 && c2
< 0x90) // overlong
193 return invalid_mb_sequence
;
194 if (c1
== 0xF4 && c2
>= 0x90) // > U+10FFFF
195 return invalid_mb_sequence
;
196 unsigned char c3
= from
.next
[2];
197 if ((c3
& 0xC0) != 0x80)
198 return invalid_mb_sequence
;
199 unsigned char c4
= from
.next
[3];
200 if ((c4
& 0xC0) != 0x80)
201 return invalid_mb_sequence
;
// 0x3C82080 == (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80.
202 char32_t c
= (c1
<< 18) + (c2
<< 12) + (c3
<< 6) + c4
- 0x3C82080;
// Lead bytes 0xF5 and above never start a valid sequence.
208 return invalid_mb_sequence
;
// Encode code_point as UTF-8 at to.next, using 1, 2, 3 or 4 bytes
// depending on its magnitude; to.next is advanced past each byte written.
// The lead byte carries the length tag (0xC0 / 0xE0 / 0xF0) and every
// following byte carries six payload bits tagged with 0x80.
// NOTE(review): the return type, the per-branch space checks and the
// return statements are not visible in this listing; callers treat the
// result as a success flag.
212 write_utf8_code_point(range
<char>& to
, char32_t code_point
)
214 if (code_point
< 0x80)
218 *to
.next
++ = code_point
;
220 else if (code_point
<= 0x7FF)
224 *to
.next
++ = (code_point
>> 6) + 0xC0;
225 *to
.next
++ = (code_point
& 0x3F) + 0x80;
227 else if (code_point
<= 0xFFFF)
231 *to
.next
++ = (code_point
>> 12) + 0xE0;
232 *to
.next
++ = ((code_point
>> 6) & 0x3F) + 0x80;
233 *to
.next
++ = (code_point
& 0x3F) + 0x80;
235 else if (code_point
<= 0x10FFFF)
239 *to
.next
++ = (code_point
>> 18) + 0xF0;
240 *to
.next
++ = ((code_point
>> 12) & 0x3F) + 0x80;
241 *to
.next
++ = ((code_point
>> 6) & 0x3F) + 0x80;
242 *to
.next
++ = (code_point
& 0x3F) + 0x80;
// Convert a UTF-16 code unit between host byte order and the external
// byte order selected by (mode & little_endian).  The unit is byte-swapped
// exactly when the two orders differ, so the same function serves both
// reading and writing.
// NOTE(review): the return type and the #else / #endif lines are not
// visible in this listing.
250 adjust_byte_order(char16_t c
, codecvt_mode mode
)
252 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
// Big-endian host: swap only for little-endian external data.
253 return (mode
& little_endian
) ? __builtin_bswap16(c
) : c
;
// Other hosts: swap only for big-endian external data.
255 return (mode
& little_endian
) ? c
: __builtin_bswap16(c
);
259 // Return true if c is a high-surrogate (aka leading) code point.
// That is, c lies in the inclusive range [0xD800, 0xDBFF].
261 is_high_surrogate(char32_t c
)
263 return c
>= 0xD800 && c
<= 0xDBFF;
266 // Return true if c is a low-surrogate (aka trailing) code point.
// That is, c lies in the inclusive range [0xDC00, 0xDFFF].
268 is_low_surrogate(char32_t c
)
270 return c
>= 0xDC00 && c
<= 0xDFFF;
// Combine a high/low surrogate pair into the code point it encodes.
// 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000, so the expression equals
// 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00).
// The arguments are not validated here; callers check them first.
274 surrogate_pair_to_code_point(char32_t high
, char32_t low
)
276 return (high
<< 10) + low
- 0x35FDC00;
279 // Read a codepoint from a UTF-16 multibyte sequence.
280 // The sequence's endianness is indicated by (mode & little_endian).
281 // Updates from.next if the codepoint is not greater than maxcode.
282 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
// A high surrogate must be followed by a low surrogate; a high surrogate
// followed by anything else, or a lone low surrogate, is invalid.
// NOTE(review): the return type, the `mode` parameter declaration, the
// availability checks guarding each incomplete_mb_character return, and
// the final "advance and return c" tail are not visible in this listing.
284 read_utf16_code_point(range
<const char16_t
>& from
, unsigned long maxcode
,
287 const size_t avail
= from
.size();
289 return incomplete_mb_character
;
// Bring the first code unit into host byte order before classifying it.
291 char32_t c
= adjust_byte_order(from
.next
[0], mode
);
292 if (is_high_surrogate(c
))
295 return incomplete_mb_character
;
296 const char16_t c2
= adjust_byte_order(from
.next
[1], mode
);
297 if (is_low_surrogate(c2
))
299 c
= surrogate_pair_to_code_point(c
, c2
);
303 return invalid_mb_sequence
;
305 else if (is_low_surrogate(c
))
306 return invalid_mb_sequence
;
314 write_utf16_code_point(range
<C
>& to
, char32_t codepoint
, codecvt_mode mode
)
316 static_assert(sizeof(C
) >= 2, "a code unit must be at least 16-bit");
318 if (codepoint
< max_single_utf16_unit
)
322 *to
.next
= adjust_byte_order(codepoint
, mode
);
327 else if (to
.size() > 1)
329 // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
330 const char32_t LEAD_OFFSET
= 0xD800 - (0x10000 >> 10);
331 char16_t lead
= LEAD_OFFSET
+ (codepoint
>> 10);
332 char16_t trail
= 0xDC00 + (codepoint
& 0x3FF);
333 to
.next
[0] = adjust_byte_order(lead
, mode
);
334 to
.next
[1] = adjust_byte_order(trail
, mode
);
// Decode UTF-8 input into UCS-4 (char32_t) output.
// Skips a leading BOM when mode requests it, then decodes one code point
// per iteration while both input and output have room.
// Results: partial on a truncated sequence or when input is left over,
// error on an invalid sequence or a code point above maxcode, ok otherwise.
// (Both sentinels compare greater than maxcode, so the second test also
// catches invalid_mb_sequence.)
// NOTE(review): the return-type line is not visible in this listing.
343 ucs4_in(range
<const char>& from
, range
<char32_t
>& to
,
344 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
346 read_utf8_bom(from
, mode
);
347 while (from
.size() && to
.size())
349 const char32_t codepoint
= read_utf8_code_point(from
, maxcode
);
350 if (codepoint
== incomplete_mb_character
)
351 return codecvt_base::partial
;
352 if (codepoint
> maxcode
)
353 return codecvt_base::error
;
354 *to
.next
++ = codepoint
;
// Leftover input means the output buffer filled up first.
356 return from
.size() ? codecvt_base::partial
: codecvt_base::ok
;
// Encode UCS-4 (char32_t) input as UTF-8 output.
// Writes a BOM first when mode requests it; running out of output space
// (for the BOM or for a character) yields partial.
// NOTE(review): the return type, the loop header, the condition guarding
// the error return and the input-cursor advance are not visible in this
// listing.
361 ucs4_out(range
<const char32_t
>& from
, range
<char>& to
,
362 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
364 if (!write_utf8_bom(to
, mode
))
365 return codecvt_base::partial
;
368 const char32_t c
= from
.next
[0];
370 return codecvt_base::error
;
371 if (!write_utf8_code_point(to
, c
))
372 return codecvt_base::partial
;
375 return codecvt_base::ok
;
380 ucs4_in(range
<const char16_t
>& from
, range
<char32_t
>& to
,
381 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
383 if (read_utf16_bom(from
, mode
) == little_endian
)
384 mode
= codecvt_mode(mode
& little_endian
);
385 while (from
.size() && to
.size())
387 const char32_t codepoint
= read_utf16_code_point(from
, maxcode
, mode
);
388 if (codepoint
== incomplete_mb_character
)
389 return codecvt_base::partial
;
390 if (codepoint
> maxcode
)
391 return codecvt_base::error
;
392 *to
.next
++ = codepoint
;
394 return from
.size() ? codecvt_base::partial
: codecvt_base::ok
;
// Encode UCS-4 (char32_t) input as UTF-16 output.
// Writes a BOM first when mode requests it; running out of output space
// (for the BOM or for a character) yields partial.
// NOTE(review): the return type, the loop header, the condition guarding
// the error return and the input-cursor advance are not visible in this
// listing.
399 ucs4_out(range
<const char32_t
>& from
, range
<char16_t
>& to
,
400 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
402 if (!write_utf16_bom(to
, mode
))
403 return codecvt_base::partial
;
406 const char32_t c
= from
.next
[0];
408 return codecvt_base::error
;
409 if (!write_utf16_code_point(to
, c
, mode
))
410 return codecvt_base::partial
;
413 return codecvt_base::ok
;
// Decode UTF-8 input into UTF-16 output (code unit type C).
// Skips a leading UTF-8 BOM when mode requests it.  `first` remembers
// where the current character started.
// NOTE(review): the template header, return type and the statement that
// uses `first` when the output has no room are not visible in this
// listing; presumably the input cursor is rewound there - confirm.
419 utf16_in(range
<const char>& from
, range
<C
>& to
,
420 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
422 read_utf8_bom(from
, mode
);
423 while (from
.size() && to
.size())
425 const char* const first
= from
.next
;
426 const char32_t codepoint
= read_utf8_code_point(from
, maxcode
);
427 if (codepoint
== incomplete_mb_character
)
428 return codecvt_base::partial
;
429 if (codepoint
> maxcode
)
430 return codecvt_base::error
;
// A failed write means the output cannot hold this character.
431 if (!write_utf16_code_point(to
, codepoint
, mode
))
434 return codecvt_base::partial
;
437 return codecvt_base::ok
;
// Encode UTF-16 input (code unit type C) as UTF-8 output.
// Writes a UTF-8 BOM first when mode requests it.  A high surrogate is
// combined with the following low surrogate; a high surrogate followed by
// anything else, or a lone low surrogate, is an error.
// NOTE(review): the template header, return type, loop header, the check
// that precedes the "stop converting" return, the condition guarding the
// third error return and the input-cursor advance are not visible in this
// listing.
443 utf16_out(range
<const C
>& from
, range
<char>& to
,
444 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
446 if (!write_utf8_bom(to
, mode
))
447 return codecvt_base::partial
;
450 char32_t c
= from
.next
[0];
452 if (is_high_surrogate(c
))
455 return codecvt_base::ok
; // stop converting at this point
457 const char32_t c2
= from
.next
[1];
458 if (is_low_surrogate(c2
))
460 c
= surrogate_pair_to_code_point(c
, c2
);
464 return codecvt_base::error
;
466 else if (is_low_surrogate(c
))
467 return codecvt_base::error
;
469 return codecvt_base::error
;
470 if (!write_utf8_code_point(to
, c
))
471 return codecvt_base::partial
;
474 return codecvt_base::ok
;
477 // return pos such that [begin,pos) is valid UTF-16 string no longer than max
479 utf16_span(const char* begin
, const char* end
, size_t max
,
480 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
482 range
<const char> from
{ begin
, end
};
483 read_utf8_bom(from
, mode
);
485 while (count
+1 < max
)
487 char32_t c
= read_utf8_code_point(from
, maxcode
);
490 else if (c
> max_single_utf16_unit
)
494 if (count
+1 == max
) // take one more character if it fits in a single unit
495 read_utf8_code_point(from
, std::max(max_single_utf16_unit
, maxcode
));
501 ucs2_in(range
<const char>& from
, range
<char16_t
>& to
,
502 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
504 return utf16_in(from
, to
, std::max(max_single_utf16_unit
, maxcode
), mode
);
509 ucs2_out(range
<const char16_t
>& from
, range
<char>& to
,
510 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
512 return utf16_out(from
, to
, std::max(max_single_utf16_unit
, maxcode
), mode
);
// Copy UCS-2 input to UTF-16 output, converting each unit to the external
// byte order selected by mode.  Writes a BOM first when mode requests it.
// A high surrogate in the input is an error.
// NOTE(review): the return type, the condition guarding the second error
// return and the input-cursor advance are not visible in this listing.
517 ucs2_out(range
<const char16_t
>& from
, range
<char16_t
>& to
,
518 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
520 if (!write_utf16_bom(to
, mode
))
521 return codecvt_base::partial
;
522 while (from
.size() && to
.size())
524 char16_t c
= from
.next
[0];
525 if (is_high_surrogate(c
))
526 return codecvt_base::error
;
528 return codecvt_base::error
;
529 *to
.next
++ = adjust_byte_order(c
, mode
);
// Leftover input means the output buffer filled up first.
532 return from
.size() == 0 ? codecvt_base::ok
: codecvt_base::partial
;
537 ucs2_in(range
<const char16_t
>& from
, range
<char16_t
>& to
,
538 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
540 if (read_utf16_bom(from
, mode
) == little_endian
)
541 mode
= codecvt_mode(mode
& little_endian
);
542 maxcode
= std::max(max_single_utf16_unit
, maxcode
);
543 while (from
.size() && to
.size())
545 const char32_t c
= read_utf16_code_point(from
, maxcode
, mode
);
546 if (c
== incomplete_mb_character
)
547 return codecvt_base::partial
;
549 return codecvt_base::error
;
552 return from
.size() == 0 ? codecvt_base::ok
: codecvt_base::partial
;
556 ucs2_span(const char16_t
* begin
, const char16_t
* end
, size_t max
,
557 char32_t maxcode
, codecvt_mode mode
)
559 range
<const char16_t
> from
{ begin
, end
};
560 if (read_utf16_bom(from
, mode
) == little_endian
)
561 mode
= codecvt_mode(mode
& little_endian
);
562 maxcode
= std::max(max_single_utf16_unit
, maxcode
);
564 while (max
-- && c
<= maxcode
)
565 c
= read_utf16_code_point(from
, maxcode
, mode
);
570 ucs2_span(const char* begin
, const char* end
, size_t max
,
571 char32_t maxcode
, codecvt_mode mode
)
573 range
<const char> from
{ begin
, end
};
574 read_utf8_bom(from
, mode
);
575 maxcode
= std::max(max_single_utf16_unit
, maxcode
);
577 while (max
-- && c
<= maxcode
)
578 c
= read_utf8_code_point(from
, maxcode
);
582 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
// The input is UTF-8.  A leading BOM is skipped when mode requests it, then
// up to max code points are read, stopping early once a read yields a value
// above maxcode (which includes both sentinel results).
// NOTE(review): the return type, the declaration of `c` and the final
// return are not visible in this listing.
584 ucs4_span(const char* begin
, const char* end
, size_t max
,
585 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
587 range
<const char> from
{ begin
, end
};
588 read_utf8_bom(from
, mode
);
590 while (max
-- && c
<= maxcode
)
591 c
= read_utf8_code_point(from
, maxcode
);
595 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
597 ucs4_span(const char16_t
* begin
, const char16_t
* end
, size_t max
,
598 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
600 range
<const char16_t
> from
{ begin
, end
};
601 if (read_utf16_bom(from
, mode
) == little_endian
)
602 mode
= codecvt_mode(mode
& little_endian
);
604 while (max
-- && c
<= maxcode
)
605 c
= read_utf16_code_point(from
, maxcode
, mode
);
610 // Define members of codecvt<char16_t, char, mbstate_t> specialization.
611 // Converts from UTF-8 to UTF-16.
// Definition of the facet's static locale::id member.
613 locale::id codecvt
<char16_t
, char, mbstate_t>::id
;
// Out-of-line destructor; nothing to release.
615 codecvt
<char16_t
, char, mbstate_t>::~codecvt() { }
// Internal (char16_t) to external (char) conversion: wraps the caller's
// buffers in range objects, runs utf16_out with default limits and mode,
// and reports how much input was consumed through __from_next.
// NOTE(review): the return type, the line carrying the member name and
// first parameter, the __to_next update and the return are not visible in
// this listing; the name do_out is inferred from the parameter types.
618 codecvt
<char16_t
, char, mbstate_t>::
620 const intern_type
* __from
,
621 const intern_type
* __from_end
, const intern_type
*& __from_next
,
622 extern_type
* __to
, extern_type
* __to_end
,
623 extern_type
*& __to_next
) const
625 range
<const char16_t
> from
{ __from
, __from_end
};
626 range
<char> to
{ __to
, __to_end
};
627 auto res
= utf16_out(from
, to
);
628 __from_next
= from
.next
;
// There is no shift state to flush, so this reports noconv.
// NOTE(review): the return type and any assignment to __to_next are not
// visible in this listing.
634 codecvt
<char16_t
, char, mbstate_t>::
635 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
636 extern_type
*& __to_next
) const
639 return noconv
; // we don't use mbstate_t for the unicode facets
// External (UTF-8) to internal (UTF-16) conversion via utf16_in, with the
// limit max_code_point.  The mode is chosen from the host byte order so
// the char16_t output is produced in host order.
// NOTE(review): the return type, the #else / #endif lines, the __to_next
// update and the return are not visible in this listing.
643 codecvt
<char16_t
, char, mbstate_t>::
644 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
645 const extern_type
*& __from_next
,
646 intern_type
* __to
, intern_type
* __to_end
,
647 intern_type
*& __to_next
) const
649 range
<const char> from
{ __from
, __from_end
};
650 range
<char16_t
> to
{ __to
, __to_end
};
651 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
652 codecvt_mode mode
= {};
654 codecvt_mode mode
= little_endian
;
656 auto res
= utf16_in(from
, to
, max_code_point
, mode
);
657 __from_next
= from
.next
;
// do_encoding and do_always_noconv for codecvt<char16_t, char, mbstate_t>.
// NOTE(review): only the declarator lines are visible in this listing; the
// return types and bodies are missing, so the returned values cannot be
// documented from here.
663 codecvt
<char16_t
, char, mbstate_t>::do_encoding() const throw()
667 codecvt
<char16_t
, char, mbstate_t>::do_always_noconv() const throw()
// Number of external bytes in [__from, __end) that utf16_span accepts for
// a budget of __max (default limit and mode).
// NOTE(review): the return-type line is not visible in this listing.
671 codecvt
<char16_t
, char, mbstate_t>::
672 do_length(state_type
&, const extern_type
* __from
,
673 const extern_type
* __end
, size_t __max
) const
// Reuse __end to hold the position returned by the scan.
675 __end
= utf16_span(__from
, __end
, __max
);
676 return __end
- __from
;
// NOTE(review): the return type and the return statement are not visible
// in this listing; the comments below explain the reasoning behind the
// value returned.
680 codecvt
<char16_t
, char, mbstate_t>::do_max_length() const throw()
682 // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
683 // whereas 4 byte sequences require two 16-bit code units.
687 // Define members of codecvt<char32_t, char, mbstate_t> specialization.
688 // Converts from UTF-8 to UTF-32 (aka UCS-4).
// Definition of the facet's static locale::id member.
690 locale::id codecvt
<char32_t
, char, mbstate_t>::id
;
// Out-of-line destructor; nothing to release.
692 codecvt
<char32_t
, char, mbstate_t>::~codecvt() { }
// Internal (char32_t) to external (UTF-8) conversion: wraps the caller's
// buffers in range objects, runs ucs4_out with default limits and mode,
// and reports how much input was consumed through __from_next.
// NOTE(review): the return type, the __to_next update and the return are
// not visible in this listing.
695 codecvt
<char32_t
, char, mbstate_t>::
696 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
697 const intern_type
*& __from_next
,
698 extern_type
* __to
, extern_type
* __to_end
,
699 extern_type
*& __to_next
) const
701 range
<const char32_t
> from
{ __from
, __from_end
};
702 range
<char> to
{ __to
, __to_end
};
703 auto res
= ucs4_out(from
, to
);
704 __from_next
= from
.next
;
// do_unshift for codecvt<char32_t, char, mbstate_t>.
// NOTE(review): the return type and body are not visible in this listing;
// presumably it mirrors the char16_t specialization - confirm.
710 codecvt
<char32_t
, char, mbstate_t>::
711 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
712 extern_type
*& __to_next
) const
// External (UTF-8) to internal (char32_t) conversion: wraps the caller's
// buffers in range objects, runs ucs4_in with default limits and mode, and
// reports how much input was consumed through __from_next.
// NOTE(review): the return type, the __to_next update and the return are
// not visible in this listing.
719 codecvt
<char32_t
, char, mbstate_t>::
720 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
721 const extern_type
*& __from_next
,
722 intern_type
* __to
, intern_type
* __to_end
,
723 intern_type
*& __to_next
) const
725 range
<const char> from
{ __from
, __from_end
};
726 range
<char32_t
> to
{ __to
, __to_end
};
727 auto res
= ucs4_in(from
, to
);
728 __from_next
= from
.next
;
// do_encoding and do_always_noconv for codecvt<char32_t, char, mbstate_t>.
// NOTE(review): only the declarator lines are visible in this listing; the
// return types and bodies are missing, so the returned values cannot be
// documented from here.
734 codecvt
<char32_t
, char, mbstate_t>::do_encoding() const throw()
738 codecvt
<char32_t
, char, mbstate_t>::do_always_noconv() const throw()
// Number of external bytes in [__from, __end) that ucs4_span accepts for a
// budget of __max code points (default limit and mode).
// NOTE(review): the return-type line is not visible in this listing.
742 codecvt
<char32_t
, char, mbstate_t>::
743 do_length(state_type
&, const extern_type
* __from
,
744 const extern_type
* __end
, size_t __max
) const
// Reuse __end to hold the position returned by the scan.
746 __end
= ucs4_span(__from
, __end
, __max
);
747 return __end
- __from
;
// do_max_length for codecvt<char32_t, char, mbstate_t>.
// NOTE(review): the return type and body are not visible in this listing.
751 codecvt
<char32_t
, char, mbstate_t>::do_max_length() const throw()
754 // Define members of codecvt_utf8<char16_t> base class implementation.
755 // Converts from UTF-8 to UCS-2.
// Out-of-line destructor; nothing to release.
757 __codecvt_utf8_base
<char16_t
>::~__codecvt_utf8_base() { }
// Internal (UCS-2) to external (UTF-8) conversion via ucs2_out, using the
// facet's _M_maxcode and _M_mode; reports consumed input via __from_next.
// NOTE(review): the return type, the __to_next update and the return are
// not visible in this listing.
760 __codecvt_utf8_base
<char16_t
>::
761 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
762 const intern_type
*& __from_next
,
763 extern_type
* __to
, extern_type
* __to_end
,
764 extern_type
*& __to_next
) const
766 range
<const char16_t
> from
{ __from
, __from_end
};
767 range
<char> to
{ __to
, __to_end
};
768 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
769 __from_next
= from
.next
;
// do_unshift for __codecvt_utf8_base<char16_t>.
// NOTE(review): the return type and body are not visible in this listing.
775 __codecvt_utf8_base
<char16_t
>::
776 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
777 extern_type
*& __to_next
) const
// External (UTF-8) to internal (UCS-2) conversion via ucs2_in, using the
// facet's _M_maxcode and _M_mode; reports consumed input via __from_next.
// NOTE(review): the return type, the __to_next update and the return are
// not visible in this listing.
784 __codecvt_utf8_base
<char16_t
>::
785 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
786 const extern_type
*& __from_next
,
787 intern_type
* __to
, intern_type
* __to_end
,
788 intern_type
*& __to_next
) const
790 range
<const char> from
{ __from
, __from_end
};
791 range
<char16_t
> to
{ __to
, __to_end
};
792 auto res
= ucs2_in(from
, to
, _M_maxcode
, _M_mode
);
793 __from_next
= from
.next
;
// do_encoding and do_always_noconv for __codecvt_utf8_base<char16_t>.
// NOTE(review): only the declarator lines are visible in this listing; the
// return types and bodies are missing.
799 __codecvt_utf8_base
<char16_t
>::do_encoding() const throw()
803 __codecvt_utf8_base
<char16_t
>::do_always_noconv() const throw()
// Number of external bytes in [__from, __end) that the UTF-8 ucs2_span
// accepts for a budget of __max characters, under the facet's limit/mode.
// NOTE(review): the return-type line is not visible in this listing.
807 __codecvt_utf8_base
<char16_t
>::
808 do_length(state_type
&, const extern_type
* __from
,
809 const extern_type
* __end
, size_t __max
) const
// Reuse __end to hold the position returned by the scan.
811 __end
= ucs2_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
812 return __end
- __from
;
// do_max_length for __codecvt_utf8_base<char16_t>.
// NOTE(review): the return type and body are not visible in this listing.
816 __codecvt_utf8_base
<char16_t
>::do_max_length() const throw()
819 // Define members of codecvt_utf8<char32_t> base class implementation.
820 // Converts from UTF-8 to UTF-32 (aka UCS-4).
// Out-of-line destructor; nothing to release.
822 __codecvt_utf8_base
<char32_t
>::~__codecvt_utf8_base() { }
// Internal (UCS-4) to external (UTF-8) conversion via ucs4_out, using the
// facet's _M_maxcode and _M_mode; reports consumed input via __from_next.
// NOTE(review): the return type, the __to_next update and the return are
// not visible in this listing.
825 __codecvt_utf8_base
<char32_t
>::
826 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
827 const intern_type
*& __from_next
,
828 extern_type
* __to
, extern_type
* __to_end
,
829 extern_type
*& __to_next
) const
831 range
<const char32_t
> from
{ __from
, __from_end
};
832 range
<char> to
{ __to
, __to_end
};
833 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
834 __from_next
= from
.next
;
// do_unshift for __codecvt_utf8_base<char32_t>.
// NOTE(review): the return type and body are not visible in this listing.
840 __codecvt_utf8_base
<char32_t
>::
841 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
842 extern_type
*& __to_next
) const
// External (UTF-8) to internal (UCS-4) conversion via ucs4_in, using the
// facet's _M_maxcode and _M_mode; reports consumed input via __from_next.
// NOTE(review): the return type, the __to_next update and the return are
// not visible in this listing.
849 __codecvt_utf8_base
<char32_t
>::
850 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
851 const extern_type
*& __from_next
,
852 intern_type
* __to
, intern_type
* __to_end
,
853 intern_type
*& __to_next
) const
855 range
<const char> from
{ __from
, __from_end
};
856 range
<char32_t
> to
{ __to
, __to_end
};
857 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
858 __from_next
= from
.next
;
// do_encoding and do_always_noconv for __codecvt_utf8_base<char32_t>.
// NOTE(review): only the declarator lines are visible in this listing; the
// return types and bodies are missing.
864 __codecvt_utf8_base
<char32_t
>::do_encoding() const throw()
868 __codecvt_utf8_base
<char32_t
>::do_always_noconv() const throw()
// Number of external bytes in [__from, __end) that the UTF-8 ucs4_span
// accepts for a budget of __max code points, under the facet's limit/mode.
// NOTE(review): the return-type line is not visible in this listing.
872 __codecvt_utf8_base
<char32_t
>::
873 do_length(state_type
&, const extern_type
* __from
,
874 const extern_type
* __end
, size_t __max
) const
// Reuse __end to hold the position returned by the scan.
876 __end
= ucs4_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
877 return __end
- __from
;
// do_max_length for __codecvt_utf8_base<char32_t>.
// NOTE(review): the return type and body are not visible in this listing.
881 __codecvt_utf8_base
<char32_t
>::do_max_length() const throw()
884 #ifdef _GLIBCXX_USE_WCHAR_T
885 // Define members of codecvt_utf8<wchar_t> base class implementation.
886 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
// Out-of-line destructor; nothing to release.
888 __codecvt_utf8_base
<wchar_t>::~__codecvt_utf8_base() { }
// Internal (wchar_t) to external (UTF-8) conversion.  The wchar_t input is
// reinterpreted as char16_t or char32_t according to __SIZEOF_WCHAR_T__
// and handed to ucs2_out or ucs4_out; any other size reports error.
// NOTE(review): the return type, the closing lines of the range
// initialisers, the #else / #endif lines, the __to_next update and the
// final return are not visible in this listing.
891 __codecvt_utf8_base
<wchar_t>::
892 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
893 const intern_type
*& __from_next
,
894 extern_type
* __to
, extern_type
* __to_end
,
895 extern_type
*& __to_next
) const
897 range
<char> to
{ __to
, __to_end
};
898 #if __SIZEOF_WCHAR_T__ == 2
899 range
<const char16_t
> from
{
900 reinterpret_cast<const char16_t
*>(__from
),
901 reinterpret_cast<const char16_t
*>(__from_end
)
903 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
904 #elif __SIZEOF_WCHAR_T__ == 4
905 range
<const char32_t
> from
{
906 reinterpret_cast<const char32_t
*>(__from
),
907 reinterpret_cast<const char32_t
*>(__from_end
)
909 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
911 return codecvt_base::error
;
// Convert the consumed-input position back to the caller's wchar_t type.
913 __from_next
= reinterpret_cast<const wchar_t*>(from
.next
);
// do_unshift for __codecvt_utf8_base<wchar_t>.
// NOTE(review): the return type and body are not visible in this listing.
919 __codecvt_utf8_base
<wchar_t>::
920 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
921 extern_type
*& __to_next
) const
// External (UTF-8) to internal (wchar_t) conversion.  The wchar_t output
// buffer is reinterpreted as char16_t or char32_t according to
// __SIZEOF_WCHAR_T__ and filled by ucs2_in or ucs4_in; any other size
// reports error.
// NOTE(review): the return type, the opening and closing lines of the two
// `to` range initialisers, the #else / #endif lines and the final return
// are not visible in this listing.
928 __codecvt_utf8_base
<wchar_t>::
929 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
930 const extern_type
*& __from_next
,
931 intern_type
* __to
, intern_type
* __to_end
,
932 intern_type
*& __to_next
) const
934 range
<const char> from
{ __from
, __from_end
};
935 #if __SIZEOF_WCHAR_T__ == 2
937 reinterpret_cast<char16_t
*>(__to
),
938 reinterpret_cast<char16_t
*>(__to_end
)
940 auto res
= ucs2_in(from
, to
, _M_maxcode
, _M_mode
);
941 #elif __SIZEOF_WCHAR_T__ == 4
943 reinterpret_cast<char32_t
*>(__to
),
944 reinterpret_cast<char32_t
*>(__to_end
)
946 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
948 return codecvt_base::error
;
950 __from_next
= from
.next
;
// Convert the output position back to the caller's wchar_t type.
951 __to_next
= reinterpret_cast<wchar_t*>(to
.next
);
// do_encoding and do_always_noconv for __codecvt_utf8_base<wchar_t>.
// NOTE(review): only the declarator lines are visible in this listing; the
// return types and bodies are missing.
956 __codecvt_utf8_base
<wchar_t>::do_encoding() const throw()
960 __codecvt_utf8_base
<wchar_t>::do_always_noconv() const throw()
// Number of external bytes in [__from, __end) accepted for a budget of
// __max characters, using the UCS-2 or UCS-4 scanner that matches
// __SIZEOF_WCHAR_T__.
// NOTE(review): the return type and the #else / #endif lines are not
// visible in this listing.
964 __codecvt_utf8_base
<wchar_t>::
965 do_length(state_type
&, const extern_type
* __from
,
966 const extern_type
* __end
, size_t __max
) const
968 #if __SIZEOF_WCHAR_T__ == 2
969 __end
= ucs2_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
970 #elif __SIZEOF_WCHAR_T__ == 4
971 __end
= ucs4_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
975 return __end
- __from
;
// do_max_length for __codecvt_utf8_base<wchar_t>.
// NOTE(review): the return type and body are not visible in this listing.
979 __codecvt_utf8_base
<wchar_t>::do_max_length() const throw()
983 // Define members of codecvt_utf16<char16_t> base class implementation.
984 // Converts from UTF-16 to UCS-2.
// Out-of-line destructor; nothing to release.
986 __codecvt_utf16_base
<char16_t
>::~__codecvt_utf16_base() { }
// Internal (UCS-2) to external (UTF-16 bytes) conversion.  The external
// char buffer is reinterpreted as char16_t code units and filled by the
// UTF-16 overload of ucs2_out, using the facet's _M_maxcode and _M_mode.
// NOTE(review): the return type, the opening and closing lines of the `to`
// range initialiser and the final return are not visible in this listing.
989 __codecvt_utf16_base
<char16_t
>::
990 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
991 const intern_type
*& __from_next
,
992 extern_type
* __to
, extern_type
* __to_end
,
993 extern_type
*& __to_next
) const
995 range
<const char16_t
> from
{ __from
, __from_end
};
997 reinterpret_cast<char16_t
*>(__to
),
998 reinterpret_cast<char16_t
*>(__to_end
)
1000 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
1001 __from_next
= from
.next
;
// Convert the output position back to the caller's char pointer type.
1002 __to_next
= reinterpret_cast<char*>(to
.next
);
// do_unshift for __codecvt_utf16_base<char16_t>.
// NOTE(review): the body is not visible in this listing.
1006 codecvt_base::result
1007 __codecvt_utf16_base
<char16_t
>::
1008 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1009 extern_type
*& __to_next
) const
// External (UTF-16 bytes) to internal (UCS-2) conversion.  The external
// char buffer is reinterpreted as char16_t code units and decoded by the
// UTF-16 overload of ucs2_in, using the facet's _M_maxcode and _M_mode.
// NOTE(review): the closing line of the `from` range initialiser and the
// final return are not visible in this listing.
1015 codecvt_base::result
1016 __codecvt_utf16_base
<char16_t
>::
1017 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1018 const extern_type
*& __from_next
,
1019 intern_type
* __to
, intern_type
* __to_end
,
1020 intern_type
*& __to_next
) const
1022 range
<const char16_t
> from
{
1023 reinterpret_cast<const char16_t
*>(__from
),
1024 reinterpret_cast<const char16_t
*>(__from_end
)
1026 range
<char16_t
> to
{ __to
, __to_end
};
1027 auto res
= ucs2_in(from
, to
, _M_maxcode
, _M_mode
);
// Convert the consumed-input position back to the caller's char type.
1028 __from_next
= reinterpret_cast<const char*>(from
.next
);
1029 __to_next
= to
.next
;
// do_encoding and do_always_noconv for __codecvt_utf16_base<char16_t>.
// NOTE(review): only the declarator lines are visible in this listing; the
// return types and bodies are missing.
1034 __codecvt_utf16_base
<char16_t
>::do_encoding() const throw()
1038 __codecvt_utf16_base
<char16_t
>::do_always_noconv() const throw()
// Number of external bytes in [__from, __end) that the UTF-16 ucs2_span
// accepts for a budget of __max characters.  The scan works on char16_t
// code units; the result is converted back to a byte distance.
// NOTE(review): the return-type line is not visible in this listing.
1042 __codecvt_utf16_base
<char16_t
>::
1043 do_length(state_type
&, const extern_type
* __from
,
1044 const extern_type
* __end
, size_t __max
) const
1046 auto next
= reinterpret_cast<const char16_t
*>(__from
);
1047 next
= ucs2_span(next
, reinterpret_cast<const char16_t
*>(__end
), __max
,
1048 _M_maxcode
, _M_mode
);
1049 return reinterpret_cast<const char*>(next
) - __from
;
// do_max_length for __codecvt_utf16_base<char16_t>.
// NOTE(review): the return type and body are not visible in this listing.
1053 __codecvt_utf16_base
<char16_t
>::do_max_length() const throw()
1056 // Define members of codecvt_utf16<char32_t> base class implementation.
1057 // Converts from UTF-16 to UTF-32 (aka UCS-4).
// Out-of-line destructor; nothing to release.
1059 __codecvt_utf16_base
<char32_t
>::~__codecvt_utf16_base() { }
// Internal (UCS-4) to external (UTF-16 bytes) conversion.  The external
// char buffer is reinterpreted as char16_t code units and filled by the
// UTF-16 overload of ucs4_out, using the facet's _M_maxcode and _M_mode.
// NOTE(review): the opening and closing lines of the `to` range
// initialiser and the final return are not visible in this listing.
1061 codecvt_base::result
1062 __codecvt_utf16_base
<char32_t
>::
1063 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1064 const intern_type
*& __from_next
,
1065 extern_type
* __to
, extern_type
* __to_end
,
1066 extern_type
*& __to_next
) const
1068 range
<const char32_t
> from
{ __from
, __from_end
};
1070 reinterpret_cast<char16_t
*>(__to
),
1071 reinterpret_cast<char16_t
*>(__to_end
)
1073 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
1074 __from_next
= from
.next
;
// Convert the output position back to the caller's char pointer type.
1075 __to_next
= reinterpret_cast<char*>(to
.next
);
// do_unshift for __codecvt_utf16_base<char32_t>.
// NOTE(review): the body is not visible in this listing.
1079 codecvt_base::result
1080 __codecvt_utf16_base
<char32_t
>::
1081 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1082 extern_type
*& __to_next
) const
// External (UTF-16 bytes) to internal (UCS-4) conversion.  The external
// char buffer is reinterpreted as char16_t code units and decoded by the
// UTF-16 overload of ucs4_in, using the facet's _M_maxcode and _M_mode.
// NOTE(review): the closing line of the `from` range initialiser and the
// final return are not visible in this listing.
1088 codecvt_base::result
1089 __codecvt_utf16_base
<char32_t
>::
1090 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1091 const extern_type
*& __from_next
,
1092 intern_type
* __to
, intern_type
* __to_end
,
1093 intern_type
*& __to_next
) const
1095 range
<const char16_t
> from
{
1096 reinterpret_cast<const char16_t
*>(__from
),
1097 reinterpret_cast<const char16_t
*>(__from_end
)
1099 range
<char32_t
> to
{ __to
, __to_end
};
1100 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
// Convert the consumed-input position back to the caller's char type.
1101 __from_next
= reinterpret_cast<const char*>(from
.next
);
1102 __to_next
= to
.next
;
// do_encoding and do_always_noconv for __codecvt_utf16_base<char32_t>.
// NOTE(review): only the declarator lines are visible in this listing; the
// return types and bodies are missing.
1107 __codecvt_utf16_base
<char32_t
>::do_encoding() const throw()
1111 __codecvt_utf16_base
<char32_t
>::do_always_noconv() const throw()
// Number of external bytes in [__from, __end) that the UTF-16 ucs4_span
// accepts for a budget of __max code points.  The scan works on char16_t
// code units; the result is converted back to a byte distance.
// NOTE(review): the return-type line is not visible in this listing.
1115 __codecvt_utf16_base
<char32_t
>::
1116 do_length(state_type
&, const extern_type
* __from
,
1117 const extern_type
* __end
, size_t __max
) const
1119 auto next
= reinterpret_cast<const char16_t
*>(__from
);
1120 next
= ucs4_span(next
, reinterpret_cast<const char16_t
*>(__end
), __max
,
1121 _M_maxcode
, _M_mode
);
1122 return reinterpret_cast<const char*>(next
) - __from
;
// do_max_length for __codecvt_utf16_base<char32_t>.
// NOTE(review): the return type and body are not visible in this listing.
1126 __codecvt_utf16_base
<char32_t
>::do_max_length() const throw()
1129 #ifdef _GLIBCXX_USE_WCHAR_T
1130 // Define members of codecvt_utf16<wchar_t> base class implementation.
1131 // Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
// NOTE(review): do_length below scans char16_t code units, whereas do_out
// and do_in as written here wrap the external buffer as plain char and so
// reach the UTF-8 helper overloads - confirm which is intended.
// Out-of-line destructor; nothing to release.
1133 __codecvt_utf16_base
<wchar_t>::~__codecvt_utf16_base() { }
1135 codecvt_base::result
1136 __codecvt_utf16_base
<wchar_t>::
1137 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1138 const intern_type
*& __from_next
,
1139 extern_type
* __to
, extern_type
* __to_end
,
1140 extern_type
*& __to_next
) const
1142 range
<char> to
{ __to
, __to_end
};
1143 #if __SIZEOF_WCHAR_T__ == 2
1144 range
<const char16_t
> from
{
1145 reinterpret_cast<const char16_t
*>(__from
),
1146 reinterpret_cast<const char16_t
*>(__from_end
)
1148 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
1149 #elif __SIZEOF_WCHAR_T__ == 4
1150 range
<const char32_t
> from
{
1151 reinterpret_cast<const char32_t
*>(__from
),
1152 reinterpret_cast<const char32_t
*>(__from_end
)
1154 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
1156 return codecvt_base::error
;
1158 __from_next
= reinterpret_cast<const wchar_t*>(from
.next
);
1159 __to_next
= to
.next
;
// do_unshift for __codecvt_utf16_base<wchar_t>.
// NOTE(review): the body is not visible in this listing.
1163 codecvt_base::result
1164 __codecvt_utf16_base
<wchar_t>::
1165 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1166 extern_type
*& __to_next
) const
1172 codecvt_base::result
1173 __codecvt_utf16_base
<wchar_t>::
1174 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1175 const extern_type
*& __from_next
,
1176 intern_type
* __to
, intern_type
* __to_end
,
1177 intern_type
*& __to_next
) const
1179 range
<const char> from
{ __from
, __from_end
};
1180 #if __SIZEOF_WCHAR_T__ == 2
1182 reinterpret_cast<char16_t
*>(__to
),
1183 reinterpret_cast<char16_t
*>(__to_end
)
1185 auto res
= ucs2_in(from
, to
, _M_maxcode
, _M_mode
);
1186 #elif __SIZEOF_WCHAR_T__ == 4
1188 reinterpret_cast<char32_t
*>(__to
),
1189 reinterpret_cast<char32_t
*>(__to_end
)
1191 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
1193 return codecvt_base::error
;
1195 __from_next
= from
.next
;
1196 __to_next
= reinterpret_cast<wchar_t*>(to
.next
);
// do_encoding and do_always_noconv for __codecvt_utf16_base<wchar_t>.
// NOTE(review): only the declarator lines are visible in this listing; the
// return types and bodies are missing.
1201 __codecvt_utf16_base
<wchar_t>::do_encoding() const throw()
1205 __codecvt_utf16_base
<wchar_t>::do_always_noconv() const throw()
// Number of external bytes in [__from, __end) accepted for a budget of
// __max characters.  The external bytes are scanned as char16_t code units
// with the UCS-2 or UCS-4 scanner that matches __SIZEOF_WCHAR_T__, and the
// result is converted back to a byte distance.
// NOTE(review): the return type and the #else / #endif lines are not
// visible in this listing.
1209 __codecvt_utf16_base
<wchar_t>::
1210 do_length(state_type
&, const extern_type
* __from
,
1211 const extern_type
* __end
, size_t __max
) const
1213 auto next
= reinterpret_cast<const char16_t
*>(__from
);
1214 #if __SIZEOF_WCHAR_T__ == 2
1215 next
= ucs2_span(next
, reinterpret_cast<const char16_t
*>(__end
), __max
,
1216 _M_maxcode
, _M_mode
);
1217 #elif __SIZEOF_WCHAR_T__ == 4
1218 next
= ucs4_span(next
, reinterpret_cast<const char16_t
*>(__end
), __max
,
1219 _M_maxcode
, _M_mode
);
1221 return reinterpret_cast<const char*>(next
) - __from
;
// do_max_length for __codecvt_utf16_base<wchar_t>.
// NOTE(review): the return type and body are not visible in this listing.
1225 __codecvt_utf16_base
<wchar_t>::do_max_length() const throw()
1229 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1230 // Converts from UTF-8 to UTF-16.
// Out-of-line destructor; nothing to release.
1232 __codecvt_utf8_utf16_base
<char16_t
>::~__codecvt_utf8_utf16_base() { }
// Internal (UTF-16) to external (UTF-8) conversion via utf16_out, using
// the facet's _M_maxcode and _M_mode; reports the consumed input and
// produced output positions through __from_next / __to_next.
// NOTE(review): the final return is not visible in this listing.
1234 codecvt_base::result
1235 __codecvt_utf8_utf16_base
<char16_t
>::
1236 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1237 const intern_type
*& __from_next
,
1238 extern_type
* __to
, extern_type
* __to_end
,
1239 extern_type
*& __to_next
) const
1241 range
<const char16_t
> from
{ __from
, __from_end
};
1242 range
<char> to
{ __to
, __to_end
};
1243 auto res
= utf16_out(from
, to
, _M_maxcode
, _M_mode
);
1244 __from_next
= from
.next
;
1245 __to_next
= to
.next
;
1249 codecvt_base::result
1250 __codecvt_utf8_utf16_base
<char16_t
>::
1251 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1252 extern_type
*& __to_next
) const
1258 codecvt_base::result
1259 __codecvt_utf8_utf16_base
<char16_t
>::
1260 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1261 const extern_type
*& __from_next
,
1262 intern_type
* __to
, intern_type
* __to_end
,
1263 intern_type
*& __to_next
) const
1265 range
<const char> from
{ __from
, __from_end
};
1266 range
<char16_t
> to
{ __to
, __to_end
};
1267 codecvt_mode mode
= codecvt_mode(_M_mode
| (consume_header
|generate_header
));
1268 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1269 mode
= codecvt_mode(mode
| little_endian
);
1271 auto res
= utf16_in(from
, to
, _M_maxcode
, mode
);
1272 __from_next
= from
.next
;
1273 __to_next
= to
.next
;
1278 __codecvt_utf8_utf16_base
<char16_t
>::do_encoding() const throw()
1282 __codecvt_utf8_utf16_base
<char16_t
>::do_always_noconv() const throw()
1286 __codecvt_utf8_utf16_base
<char16_t
>::
1287 do_length(state_type
&, const extern_type
* __from
,
1288 const extern_type
* __end
, size_t __max
) const
1290 __end
= utf16_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1291 return __end
- __from
;
1295 __codecvt_utf8_utf16_base
<char16_t
>::do_max_length() const throw()
1297 // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
1298 // whereas 4 byte sequences require two 16-bit code units.
1302 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1303 // Converts from UTF-8 to UTF-16.
1305 __codecvt_utf8_utf16_base
<char32_t
>::~__codecvt_utf8_utf16_base() { }
1307 codecvt_base::result
1308 __codecvt_utf8_utf16_base
<char32_t
>::
1309 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1310 const intern_type
*& __from_next
,
1311 extern_type
* __to
, extern_type
* __to_end
,
1312 extern_type
*& __to_next
) const
1314 range
<const char32_t
> from
{ __from
, __from_end
};
1315 range
<char> to
{ __to
, __to_end
};
1316 auto res
= utf16_out(from
, to
, _M_maxcode
, _M_mode
);
1317 __from_next
= from
.next
;
1318 __to_next
= to
.next
;
1322 codecvt_base::result
1323 __codecvt_utf8_utf16_base
<char32_t
>::
1324 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1325 extern_type
*& __to_next
) const
1331 codecvt_base::result
1332 __codecvt_utf8_utf16_base
<char32_t
>::
1333 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1334 const extern_type
*& __from_next
,
1335 intern_type
* __to
, intern_type
* __to_end
,
1336 intern_type
*& __to_next
) const
1338 range
<const char> from
{ __from
, __from_end
};
1339 range
<char32_t
> to
{ __to
, __to_end
};
1340 auto res
= utf16_in(from
, to
, _M_maxcode
, _M_mode
);
1341 __from_next
= from
.next
;
1342 __to_next
= to
.next
;
1347 __codecvt_utf8_utf16_base
<char32_t
>::do_encoding() const throw()
1351 __codecvt_utf8_utf16_base
<char32_t
>::do_always_noconv() const throw()
1355 __codecvt_utf8_utf16_base
<char32_t
>::
1356 do_length(state_type
&, const extern_type
* __from
,
1357 const extern_type
* __end
, size_t __max
) const
1359 __end
= utf16_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1360 return __end
- __from
;
1364 __codecvt_utf8_utf16_base
<char32_t
>::do_max_length() const throw()
1366 // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
1367 // whereas 4 byte sequences require two 16-bit code units.
1371 #ifdef _GLIBCXX_USE_WCHAR_T
1372 // Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1373 // Converts from UTF-8 to UTF-16.
1375 __codecvt_utf8_utf16_base
<wchar_t>::~__codecvt_utf8_utf16_base() { }
1377 codecvt_base::result
1378 __codecvt_utf8_utf16_base
<wchar_t>::
1379 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1380 const intern_type
*& __from_next
,
1381 extern_type
* __to
, extern_type
* __to_end
,
1382 extern_type
*& __to_next
) const
1384 range
<const wchar_t> from
{ __from
, __from_end
};
1385 range
<char> to
{ __to
, __to_end
};
1386 auto res
= utf16_out(from
, to
, _M_maxcode
, _M_mode
);
1387 __from_next
= from
.next
;
1388 __to_next
= to
.next
;
1392 codecvt_base::result
1393 __codecvt_utf8_utf16_base
<wchar_t>::
1394 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1395 extern_type
*& __to_next
) const
1401 codecvt_base::result
1402 __codecvt_utf8_utf16_base
<wchar_t>::
1403 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1404 const extern_type
*& __from_next
,
1405 intern_type
* __to
, intern_type
* __to_end
,
1406 intern_type
*& __to_next
) const
1408 range
<const char> from
{ __from
, __from_end
};
1409 range
<wchar_t> to
{ __to
, __to_end
};
1410 auto res
= utf16_in(from
, to
, _M_maxcode
, _M_mode
);
1411 __from_next
= from
.next
;
1412 __to_next
= to
.next
;
1417 __codecvt_utf8_utf16_base
<wchar_t>::do_encoding() const throw()
1421 __codecvt_utf8_utf16_base
<wchar_t>::do_always_noconv() const throw()
1425 __codecvt_utf8_utf16_base
<wchar_t>::
1426 do_length(state_type
&, const extern_type
* __from
,
1427 const extern_type
* __end
, size_t __max
) const
1429 __end
= utf16_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1430 return __end
- __from
;
1434 __codecvt_utf8_utf16_base
<wchar_t>::do_max_length() const throw()
1436 // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
1437 // whereas 4 byte sequences require two 16-bit code units.
1442 inline template class __codecvt_abstract_base
<char16_t
, char, mbstate_t>;
1443 inline template class __codecvt_abstract_base
<char32_t
, char, mbstate_t>;
1444 template class codecvt_byname
<char16_t
, char, mbstate_t>;
1445 template class codecvt_byname
<char32_t
, char, mbstate_t>;
1447 _GLIBCXX_END_NAMESPACE_VERSION
1449 #endif // _GLIBCXX_USE_C99_STDINT_TR1