2 * Copyright 2012-2017 Jacob Lifshay
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in all
12 * copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * https://github.com/programmerjake/hashlife-voxels/blob/5dda3bc240e1e89f43606316d1c3202221e3b06b/util/text.h
30 #include <type_traits>
37 #include "string_view.h"
// U+FFFD; used below as the default `error_value` argument of the decode_*
// functions and as the default Convert_options::error_value.
45 constexpr char32_t replacement_character
= U
'\uFFFD';
// Decodes one code point from a UTF-8 byte sequence read through `*iter`
// (each element is converted with static_cast<char>, then to std::uint8_t).
//
// Visible parameters:
//   allow_surrogate_code_points - defaults to true; when false, a lead byte of
//       0xED followed by a byte >= 0xA0 (the encoding of U+D800..U+DFFF) is
//       singled out.
//   allow_2_byte_null - defaults to false; when true, a lead byte of 0xC0 gets
//       its own branch (the two-byte form of U+0000).
//   error_value - defaults to replacement_character.
// The result is std::char_traits<char32_t>::eof() on one path, otherwise a
// value assembled from the payload bits of 2, 3 or 4 bytes.
// The function is noexcept exactly when ++iter, static_cast<char>(*iter) and
// the iter == sentinel comparison are noexcept.
//
// NOTE(review): this listing appears to be missing lines -- the `iter` and
// `sentinel` parameter declarations, the braces, the iterator increments and
// the statements controlled by most of the `if`s are not visible. Presumably
// the singled-out cases return error_value; confirm against the complete file.
47 template <typename Input_iterator
, typename Sentinel
>
48 typename
std::char_traits
<char32_t
>::int_type
decode_utf8(
51 bool allow_surrogate_code_points
= true,
52 bool allow_2_byte_null
= false,
53 typename
std::char_traits
<char32_t
>::int_type error_value
=
54 replacement_character
) noexcept(noexcept(++iter
) && noexcept(static_cast<char>(*iter
))
55 && noexcept(iter
== sentinel
? 0 : 0))
58 return std::char_traits
<char32_t
>::eof();
59 auto byte0
= static_cast<std::uint8_t>(static_cast<char>(*iter
));
63 if(allow_2_byte_null
&& byte0
== 0xC0)
67 auto byte1
= static_cast<std::uint8_t>(static_cast<char>(*iter
));
// 0xC2..0xF4 are the only lead bytes of well-formed multi-byte sequences
73 if(byte0
> 0xF4 || byte0
< 0xC2)
77 auto byte1
= static_cast<std::uint8_t>(static_cast<char>(*iter
));
// byte1 outside 0x80..0xBF is not a continuation byte
78 if(byte1
< 0x80 || byte1
>= 0xC0)
// two-byte sequence: 5 payload bits from byte0, 6 from byte1
83 return (static_cast<std::uint_fast32_t>(byte0
& 0x1F) << 6) | (byte1
& 0x3F);
// 0xE0 followed by a byte < 0xA0 would be an overlong three-byte form
85 if(byte0
== 0xE0 && byte1
< 0xA0)
// 0xF0 followed by a byte < 0x90 would be an overlong four-byte form
87 if(byte0
== 0xF0 && byte1
< 0x90)
// 0xF4 followed by a byte >= 0x90 would encode a value above U+10FFFF
89 if(byte0
== 0xF4 && byte1
>= 0x90)
// 0xED followed by a byte >= 0xA0 encodes a surrogate code point
91 if(!allow_surrogate_code_points
&& byte0
== 0xED && byte1
>= 0xA0)
96 auto byte2
= static_cast<std::uint8_t>(static_cast<char>(*iter
));
98 if(byte2
< 0x80 || byte2
>= 0xC0)
// three-byte sequence: 4 + 6 + 6 payload bits
101 return (static_cast<std::uint_fast32_t>(byte0
& 0xF) << 12)
102 | (static_cast<std::uint_fast32_t>(byte1
& 0x3F) << 6) | (byte2
& 0x3F);
105 auto byte3
= static_cast<std::uint8_t>(static_cast<char>(*iter
));
107 if(byte3
< 0x80 || byte3
>= 0xC0)
// four-byte sequence: 3 + 6 + 6 + 6 payload bits
109 return (static_cast<std::uint_fast32_t>(byte0
& 0x7) << 18)
110 | (static_cast<std::uint_fast32_t>(byte1
& 0x3F) << 12)
111 | (static_cast<std::uint_fast32_t>(byte2
& 0x3F) << 6) | (byte3
& 0x3F);
// Fixed-capacity buffer for the code units produced by encoding a single
// character: up to N units live in `chars`, and `used` counts how many of
// them are in use (end() is begin() + used). Offers iteration, asserted
// indexing, conversion to std::basic_string, concatenation through operator+
// and stream output through operator<<.
//
// NOTE(review): the declarations of `Char_type` (presumably an alias for T)
// and of the `used` member, as well as most function bodies, are not visible
// in this listing; confirm against the complete file.
114 template <typename T
, std::size_t N
>
115 struct Encoded_character final
117 static constexpr std::size_t max_Chars
= N
;
// a zero-capacity buffer is rejected at compile time
119 static_assert(max_Chars
!= 0, "");
120 Char_type chars
[max_Chars
];
122 constexpr Char_type
&front()
126 constexpr const Char_type
&front() const
130 constexpr Char_type
&back()
134 constexpr const Char_type
&back() const
138 typedef const Char_type
*const_iterator
;
139 typedef Char_type
*iterator
;
140 constexpr const_iterator
begin() const
// every end()/cend() overload is begin() advanced by the units in use
144 constexpr const_iterator
end() const
146 return begin() + used
;
148 constexpr const_iterator
cbegin() const
152 constexpr const_iterator
cend() const
154 return begin() + used
;
156 constexpr iterator
begin()
160 constexpr iterator
end()
162 return begin() + used
;
164 constexpr std::size_t capacity() const
168 constexpr std::size_t size() const
// indexing asserts that the index refers to a unit in use
172 constexpr const Char_type
&operator[](std::size_t index
) const
174 assert(index
< used
);
177 constexpr Char_type
&operator[](std::size_t index
)
179 assert(index
< used
);
// default state: value-initialized storage and no units in use
182 constexpr Encoded_character() : chars(), used(0)
// takes a Char_type by value, so each constructor argument below is
// implicitly converted to Char_type at the call
187 static constexpr Char_type
implicit_conversion_helper(Char_type ch
) noexcept
// Builds the buffer from a list of code units: `chars` is brace-initialized
// from the converted arguments and `used` is the argument count, which is
// statically bounded by max_Chars.
// NOTE(review): the parameter pack is unconstrained, so this constructor is
// also a candidate when copy-constructing from a non-const lvalue
// Encoded_character -- confirm that is intended.
193 template <typename
... Args
>
194 constexpr Encoded_character(Args
&&... args
)
195 : chars
{implicit_conversion_helper(std::forward
<Args
>(args
))...}, used(sizeof...(args
))
197 static_assert(sizeof...(args
) <= max_Chars
, "");
// conversion to std::basic_string built from [begin(), end())
199 template <typename Char_traits
, typename Allocator
>
200 operator std::basic_string
<Char_type
, Char_traits
, Allocator
>() const
202 return std::basic_string
<Char_type
, Char_traits
, Allocator
>(begin(), end());
// string + character: appends b's units to `a`, which is taken by value
204 template <typename Char_traits
, typename Allocator
>
205 friend std::basic_string
<Char_type
, Char_traits
, Allocator
> operator+(
206 std::basic_string
<Char_type
, Char_traits
, Allocator
> a
, const Encoded_character
&b
)
208 a
.append(b
.begin(), b
.end());
// character + string: inserts a's units at the front of `b`, taken by value
211 template <typename Char_traits
, typename Allocator
>
212 friend std::basic_string
<Char_type
, Char_traits
, Allocator
> operator+(
213 const Encoded_character
&a
, std::basic_string
<Char_type
, Char_traits
, Allocator
> b
)
215 b
.insert(b
.begin(), a
.begin(), a
.end());
// character + character (capacities may differ): reserves the combined size
// in a std::basic_string, then appends a's units followed by b's
218 template <std::size_t N2
>
219 friend std::basic_string
<Char_type
> operator+(const Encoded_character
&a
,
220 const Encoded_character
<Char_type
, N2
> &b
)
222 std::basic_string
<Char_type
> retval
;
223 retval
.reserve(a
.size() + b
.size());
224 retval
.append(a
.begin(), a
.end());
225 retval
.append(b
.begin(), b
.end());
// stream output: writes `a` converted to a std::basic_string
228 template <typename Traits
>
229 friend std::basic_ostream
<Char_type
, Traits
> &operator<<(
230 std::basic_ostream
<Char_type
, Traits
> &os
, const Encoded_character
&a
)
232 os
<< static_cast<std::basic_string
<Char_type
, Traits
>>(a
);
237 Encoded_character
<char, 4> encode_utf8(char32_t ch
, bool use_2_byte_null
= false) noexcept
239 assert(ch
< 0x10FFFFUL
&& ch
>= 0);
240 if(use_2_byte_null
&& ch
== 0)
241 return Encoded_character
<char, 4>(0xC0U
, 0x80U
);
243 return Encoded_character
<char, 4>(ch
);
245 return Encoded_character
<char, 4>(0xC0 | (ch
>> 6), 0x80 | (ch
& 0x3F));
247 return Encoded_character
<char, 4>(
248 0xE0 | (ch
>> 12), 0x80 | ((ch
>> 6) & 0x3F), 0x80 | (ch
& 0x3F));
249 return Encoded_character
<char, 4>(0xF0 | (ch
>> 18),
250 0x80 | ((ch
>> 12) & 0x3F),
251 0x80 | ((ch
>> 6) & 0x3F),
// Decodes one code point from UTF-16 code units read through `*iter`
// (each element is converted with static_cast<char16_t>, then std::uint16_t).
//
// A first unit in 0xD800..0xDBFF (a high surrogate) opens a surrogate pair.
// If the following unit is outside 0xDC00..0xDFFF, the result is unit0 when
// allow_unpaired_surrogate_code_units is true and error_value otherwise.
// A complete pair yields 0x10000 plus the low 10 bits of each unit combined.
// One path returns std::char_traits<char32_t>::eof().
// The function is noexcept exactly when ++iter, static_cast<char16_t>(*iter)
// and the iter == sentinel comparison are noexcept.
//
// NOTE(review): the `sentinel` parameter declaration, the braces, the
// iterator increments and the conditions guarding the eof return and the
// first unpaired-surrogate return are not visible in this listing; confirm
// against the complete file.
255 template <typename Input_iterator
, typename Sentinel
>
256 typename
std::char_traits
<char32_t
>::int_type
decode_utf16(
257 Input_iterator
&iter
,
259 bool allow_unpaired_surrogate_code_units
= true,
260 typename
std::char_traits
<char32_t
>::int_type error_value
=
261 replacement_character
) noexcept(noexcept(++iter
) && noexcept(static_cast<char16_t
>(*iter
))
262 && noexcept(iter
== sentinel
? 0 : 0))
265 return std::char_traits
<char32_t
>::eof();
266 auto unit0
= static_cast<std::uint16_t>(static_cast<char16_t
>(*iter
));
// unit0 is a high (leading) surrogate
268 if(unit0
>= 0xD800U
&& unit0
< 0xDC00U
)
271 return allow_unpaired_surrogate_code_units
? unit0
: error_value
;
272 auto unit1
= static_cast<std::uint16_t>(static_cast<char16_t
>(*iter
));
// unit1 is not a low (trailing) surrogate, so unit0 is unpaired
273 if(unit1
< 0xDC00U
|| unit1
>= 0xE000U
)
274 return allow_unpaired_surrogate_code_units
? unit0
: error_value
;
// combine the 10 payload bits of each surrogate above the BMP offset
276 return 0x10000UL
+ ((unit0
& 0x3FF) << 10) + (unit1
& 0x3FF);
281 Encoded_character
<char16_t
, 2> encode_utf16(char32_t ch
) noexcept
283 assert(ch
< 0x10FFFFUL
&& ch
>= 0);
285 return Encoded_character
<char16_t
, 2>(ch
);
286 return Encoded_character
<char16_t
, 2>(0xD800U
| ((ch
- 0x10000UL
) >> 10),
287 0xDC00U
| ((ch
- 0x10000UL
) & 0x3FF));
// Reads one UTF-32 code unit through `*iter` (converted with
// static_cast<char32_t>, then std::uint32_t) and tests it against two ranges:
// values above 0x10FFFF, and -- only when allow_Surrogate_Code_Units is
// false -- values in 0xD800..0xDFFF.
// One path returns std::char_traits<char32_t>::eof().
// The function is noexcept exactly when ++iter, static_cast<char32_t>(*iter)
// and the iter == sentinel comparison are noexcept.
//
// NOTE(review): the `sentinel` parameter declaration and the statements
// controlled by the two range checks (presumably `return error_value;`) are
// not visible in this listing; confirm against the complete file.
290 template <typename Input_iterator
, typename Sentinel
>
291 typename
std::char_traits
<char32_t
>::int_type
decode_utf32(
292 Input_iterator
&iter
,
294 bool allow_Surrogate_Code_Units
= true,
295 typename
std::char_traits
<char32_t
>::int_type error_value
=
296 replacement_character
) noexcept(noexcept(++iter
) && noexcept(static_cast<char32_t
>(*iter
))
297 && noexcept(iter
== sentinel
? 0 : 0))
300 return std::char_traits
<char32_t
>::eof();
301 auto retval
= static_cast<std::uint32_t>(static_cast<char32_t
>(*iter
));
// beyond the last code point
303 if(retval
> 0x10FFFFUL
)
// surrogate range, tested only when surrogates are not allowed
305 if(!allow_Surrogate_Code_Units
&& retval
>= 0xD800U
&& retval
< 0xE000U
)
// Wraps `ch` unchanged as a one-unit Encoded_character<char32_t, 1>; no range
// check is visible here.
310 Encoded_character
<char32_t
, 1> encode_utf32(char32_t ch
) noexcept
312 return Encoded_character
<char32_t
, 1>(ch
);
// Compile-time checks on the representation of wchar_t, followed by the flag
// that chooses between the UTF-16 and single-unit paths in encode_wide /
// decode_wide.
// NOTE(review): the right-hand side of the second static_assert is not
// visible in this listing.
315 static_assert(std::numeric_limits
<wchar_t>::radix
== 2, "");
// digits excludes the sign bit, so one is added back for a signed wchar_t
316 static_assert(std::numeric_limits
<wchar_t>::digits
317 + static_cast<std::size_t>(std::is_signed
<wchar_t>::value
)
// true when wchar_t has at most 16 value bits
321 constexpr bool is_wide_character_utf16
= std::numeric_limits
<wchar_t>::digits
<= 16;
// Encodes one code point as wchar_t units.
// When is_wide_character_utf16 is set, the code point is encoded with
// encode_utf16 and each resulting unit is cast to wchar_t into a buffer with
// the same `used` count; otherwise `ch` is cast to a single wchar_t.
// NOTE(review): braces and the return of `retval` are not visible in this
// listing.
323 Encoded_character
<wchar_t, 2> encode_wide(char32_t ch
) noexcept
325 if(is_wide_character_utf16
)
327 auto result
= encode_utf16(ch
);
328 Encoded_character
<wchar_t, 2> retval
;
// set the length first: operator[] asserts index < used
329 retval
.used
= result
.used
;
330 for(std::size_t i
= 0; i
< result
.size(); i
++)
332 retval
[i
] = static_cast<wchar_t>(result
[i
]);
336 return Encoded_character
<wchar_t, 2>(static_cast<wchar_t>(ch
));
// Decodes one code point from wchar_t units read through `*iter`.
// The caller's iterator is wrapped in a local Iterator_wrapper that holds it
// by reference, yields static_cast<wchar_t>(*iter) and compares against the
// sentinel; the wrapper is then handed to decode_utf16 when
// is_wide_character_utf16 is set.
// NOTE(review): the `sentinel` parameter declaration, the wrapper's operator
// declarations and the callee of the second call (presumably decode_utf32)
// are not visible in this listing; confirm against the complete file.
339 template <typename Input_iterator
, typename Sentinel
>
340 typename
std::char_traits
<char32_t
>::int_type
decode_wide(
341 Input_iterator
&iter
,
343 bool allow_unpaired_surrogate_code_units
= true,
344 typename
std::char_traits
<char32_t
>::int_type error_value
=
345 replacement_character
) noexcept(noexcept(++iter
) && noexcept(static_cast<wchar_t>(*iter
))
346 && noexcept(iter
== sentinel
? 0 : 0))
348 struct Iterator_wrapper
// reference member: advancing the wrapper advances the caller's iterator
350 Input_iterator
&iter
;
351 Iterator_wrapper(Input_iterator
&iter
) : iter(iter
)
360 return static_cast<wchar_t>(*iter
);
362 bool operator==(Sentinel
&sentinel
)
364 return iter
== sentinel
;
367 Iterator_wrapper
iterator_wrapper(iter
);
368 if(is_wide_character_utf16
)
369 return decode_utf16(iterator_wrapper
,
371 allow_unpaired_surrogate_code_units
,
374 iterator_wrapper
, std::move(sentinel
), allow_unpaired_surrogate_code_units
, error_value
);
// Options bundle passed to Decode_encode_functions<...>::decode/encode and
// string_cast.
//   error_value - defaults to replacement_character.
//   allow_unpaired_surrogate_code_points - defaults to true.
//   allow_2_byte_null - defaults to false; forwarded to decode_utf8.
//   use_2_byte_null - defaults to false; forwarded to encode_utf8.
377 struct Convert_options final
379 typename
std::char_traits
<char32_t
>::int_type error_value
= replacement_character
;
380 bool allow_unpaired_surrogate_code_points
= true;
381 bool allow_2_byte_null
= false;
382 bool use_2_byte_null
= false;
383 constexpr Convert_options()
// member-wise constructor; arguments are in member declaration order
386 constexpr Convert_options(typename
std::char_traits
<char32_t
>::int_type error_value
,
387 bool allow_unpaired_surrogate_code_points
,
388 bool allow_2_byte_null
,
389 bool use_2_byte_null
)
390 : error_value(error_value
),
391 allow_unpaired_surrogate_code_points(allow_unpaired_surrogate_code_points
),
392 allow_2_byte_null(allow_2_byte_null
),
393 use_2_byte_null(use_2_byte_null
)
// preset with all three flags false
396 static constexpr Convert_options
strict(
397 typename
std::char_traits
<char32_t
>::int_type error_value
= replacement_character
)
399 return Convert_options(error_value
, false, false, false);
// preset with all three flags true
401 static constexpr Convert_options
java(
402 typename
std::char_traits
<char32_t
>::int_type error_value
= replacement_character
)
404 return Convert_options(error_value
, true, true, true);
// Primary template: both decode and encode are declared deleted, so only the
// character types that have a specialization can be used.
408 template <typename Char_type
>
409 struct Decode_encode_functions
411 template <typename Input_iterator
, typename Sentinel
>
412 static typename
std::char_traits
<char32_t
>::int_type
decode(
413 Input_iterator
&iter
, Sentinel sentinel
, const Convert_options
&convert_options
) = delete;
414 static Encoded_character
<Char_type
, 1> encode(
415 char32_t ch
, const Convert_options
&convert_options
) noexcept
= delete;
// Specialization for char: decode forwards to decode_utf8 and encode to
// encode_utf8, taking the flags from Convert_options.
// NOTE(review): the `template <>` introducer, the sentinel parameter and the
// sentinel argument of the forwarded call are not visible in this listing.
419 struct Decode_encode_functions
<char>
421 template <typename Input_iterator
, typename Sentinel
>
422 static typename
std::char_traits
<char32_t
>::int_type
decode(
423 Input_iterator
&iter
,
425 const Convert_options
// the noexcept-specification tests the two-argument call (default options)
426 &convert_options
) noexcept(noexcept(decode_utf8(std::declval
<Input_iterator
&>(),
427 std::declval
<Sentinel
&&>())))
429 return decode_utf8(iter
,
431 convert_options
.allow_unpaired_surrogate_code_points
,
432 convert_options
.allow_2_byte_null
,
433 convert_options
.error_value
);
435 static Encoded_character
<char, 4> encode(char32_t ch
,
436 const Convert_options
&convert_options
) noexcept
438 return encode_utf8(ch
, convert_options
.use_2_byte_null
);
// Specialization for char16_t: decode forwards to decode_utf16 and encode to
// encode_utf16; encode does not read convert_options.
// NOTE(review): the `template <>` introducer, the sentinel parameter and the
// sentinel argument of the forwarded call are not visible in this listing.
443 struct Decode_encode_functions
<char16_t
>
445 template <typename Input_iterator
, typename Sentinel
>
446 static typename
std::char_traits
<char32_t
>::int_type
decode(
447 Input_iterator
&iter
,
449 const Convert_options
// the noexcept-specification tests the two-argument call (default options)
450 &convert_options
) noexcept(noexcept(decode_utf16(std::declval
<Input_iterator
&>(),
451 std::declval
<Sentinel
&&>())))
453 return decode_utf16(iter
,
455 convert_options
.allow_unpaired_surrogate_code_points
,
456 convert_options
.error_value
);
458 static Encoded_character
<char16_t
, 2> encode(char32_t ch
,
459 const Convert_options
&convert_options
) noexcept
461 return encode_utf16(ch
);
// Specialization for char32_t: decode forwards to decode_utf32 and encode to
// encode_utf32; encode does not read convert_options.
// NOTE(review): the `template <>` introducer, the sentinel parameter and the
// sentinel argument of the forwarded call are not visible in this listing.
466 struct Decode_encode_functions
<char32_t
>
468 template <typename Input_iterator
, typename Sentinel
>
469 static typename
std::char_traits
<char32_t
>::int_type
decode(
470 Input_iterator
&iter
,
472 const Convert_options
// the noexcept-specification tests the two-argument call (default options)
473 &convert_options
) noexcept(noexcept(decode_utf32(std::declval
<Input_iterator
&>(),
474 std::declval
<Sentinel
&&>())))
476 return decode_utf32(iter
,
478 convert_options
.allow_unpaired_surrogate_code_points
,
479 convert_options
.error_value
);
481 static Encoded_character
<char32_t
, 1> encode(char32_t ch
,
482 const Convert_options
&convert_options
) noexcept
484 return encode_utf32(ch
);
// Specialization for wchar_t: decode forwards to decode_wide and encode to
// encode_wide; encode does not read convert_options.
// NOTE(review): the `template <>` introducer, the sentinel parameter and the
// sentinel argument of the forwarded call are not visible in this listing.
489 struct Decode_encode_functions
<wchar_t>
491 template <typename Input_iterator
, typename Sentinel
>
492 static typename
std::char_traits
<char32_t
>::int_type
decode(
493 Input_iterator
&iter
,
495 const Convert_options
// the noexcept-specification tests the two-argument call (default options)
496 &convert_options
) noexcept(noexcept(decode_wide(std::declval
<Input_iterator
&>(),
497 std::declval
<Sentinel
&&>())))
499 return decode_wide(iter
,
501 convert_options
.allow_unpaired_surrogate_code_points
,
502 convert_options
.error_value
);
504 static Encoded_character
<wchar_t, 2> encode(char32_t ch
,
505 const Convert_options
&convert_options
) noexcept
507 return encode_wide(ch
);
// Declared only; the conversions are provided by the partial specializations
// that follow (std::basic_string target, basic_string_view source).
513 template <typename Target
, typename Source
>
514 struct String_cast_helper
;
// Conversion between strings of different character types: walks `source`,
// decoding one code point at a time with
// Decode_encode_functions<Source_Char_type>::decode (which advances `iter`)
// and appending its re-encoding from
// Decode_encode_functions<Target_Char_type>::encode to the result.
// NOTE(review): the second argument of encode (presumably convert_options),
// the braces and the final return are not visible in this listing.
516 template <typename Target_Char_type
,
517 typename Target_Traits
,
518 typename Target_Allocator
,
519 typename Source_Char_type
,
520 typename Source_Traits
>
521 struct String_cast_helper
<std::basic_string
<Target_Char_type
, Target_Traits
, Target_Allocator
>,
522 basic_string_view
<Source_Char_type
, Source_Traits
>>
524 static std::basic_string
<Target_Char_type
, Target_Traits
, Target_Allocator
> run(
525 basic_string_view
<Source_Char_type
, Source_Traits
> source
,
526 const Convert_options
&convert_options
)
528 std::basic_string
<Target_Char_type
, Target_Traits
, Target_Allocator
> retval
;
// no increment clause: decode() advances `iter` itself
529 for(auto iter
= source
.begin(); iter
!= source
.end();)
// uses Encoded_character's operator+(basic_string, Encoded_character)
531 retval
= std::move(retval
) + Decode_encode_functions
<Target_Char_type
>::encode(
532 Decode_encode_functions
<Source_Char_type
>::decode(
533 iter
, source
.end(), convert_options
),
// Same source and target character type: the result is constructed directly
// from the source range and the Convert_options argument is unnamed and
// unused, so no decoding or validation happens on this path.
// NOTE(review): the end of the range expression is not visible in this
// listing (presumably source.end()).
540 template <typename Char_type
,
541 typename Target_Traits
,
542 typename Target_Allocator
,
543 typename Source_Traits
>
544 struct String_cast_helper
<std::basic_string
<Char_type
, Target_Traits
, Target_Allocator
>,
545 basic_string_view
<Char_type
, Source_Traits
>>
547 static std::basic_string
<Char_type
, Target_Traits
, Target_Allocator
> run(
548 basic_string_view
<Char_type
, Source_Traits
> source
, const Convert_options
&)
550 return std::basic_string
<Char_type
, Target_Traits
, Target_Allocator
>(source
.begin(),
// Converts `source` to the string type `Target` using the supplied
// Convert_options; delegates to detail::String_cast_helper<Target, ...>::run.
// `Target` must be given explicitly; the source types are deduced.
556 template <typename Target
, typename Source_Char_type
, typename Source_Traits
>
557 Target
string_cast(basic_string_view
<Source_Char_type
, Source_Traits
> source
,
558 const Convert_options
&convert_options
)
560 return detail::String_cast_helper
<Target
, basic_string_view
<Source_Char_type
, Source_Traits
>>::
561 run(source
, convert_options
);
// Overload without options: same delegation to
// detail::String_cast_helper<Target, ...>::run, passing a default-constructed
// Convert_options.
564 template <typename Target
, typename Source_Char_type
, typename Source_Traits
>
565 Target
string_cast(basic_string_view
<Source_Char_type
, Source_Traits
> source
)
567 return detail::String_cast_helper
<Target
, basic_string_view
<Source_Char_type
, Source_Traits
>>::
568 run(source
, Convert_options());
574 #endif /* UTIL_TEXT_H_ */