working on util::filesystem
[kazan.git] / src / util / text.h
1 /*
2 * Copyright 2012-2017 Jacob Lifshay
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in all
12 * copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 *
22 */
23 /* translated from
24 * https://github.com/programmerjake/hashlife-voxels/blob/5dda3bc240e1e89f43606316d1c3202221e3b06b/util/text.h
25 */
26
27 #ifndef UTIL_TEXT_H_
28 #define UTIL_TEXT_H_
29
30 #include <type_traits>
31 #include <utility>
32 #include <limits>
33 #include <cstdint>
34 #include <string>
35 #include <ostream>
36 #include <cassert>
37 #include "string_view.h"
38
39 namespace vulkan_cpu
40 {
41 namespace util
42 {
43 namespace text
44 {
/// Code point returned for malformed input unless the caller supplies a different error value.
constexpr char32_t replacement_character = U'\uFFFD';

/// Decodes one code point from UTF-8 input, advancing `iter` past the bytes it consumes.
///
/// @param iter position to read from; updated in place
/// @param sentinel end-of-input marker that `iter` is compared against
/// @param allow_surrogate_code_points when false, the 3-byte sequences that encode
///        U+D800..U+DFFF produce `error_value`
/// @param allow_2_byte_null when true, the byte pair C0 80 decodes to 0
/// @param error_value value returned for malformed or truncated input
/// @return the decoded code point, `error_value` on malformed input, or
///         `std::char_traits<char32_t>::eof()` when `iter == sentinel` on entry
///
/// An invalid second byte is left unconsumed; a third or fourth byte is consumed before it is
/// validated.
template <typename Input_iterator, typename Sentinel>
typename std::char_traits<char32_t>::int_type decode_utf8(
    Input_iterator &iter,
    Sentinel sentinel,
    bool allow_surrogate_code_points = true,
    bool allow_2_byte_null = false,
    typename std::char_traits<char32_t>::int_type error_value =
        replacement_character) noexcept(noexcept(++iter) && noexcept(static_cast<char>(*iter))
                                        && noexcept(iter == sentinel ? 0 : 0))
{
    if(iter == sentinel)
        return std::char_traits<char32_t>::eof();
    auto byte0 = static_cast<std::uint8_t>(static_cast<char>(*iter));
    ++iter;
    // single-byte (ASCII) sequence
    if(byte0 < 0x80)
        return byte0;
    if(allow_2_byte_null && byte0 == 0xC0)
    {
        if(iter == sentinel)
            return error_value;
        auto byte1 = static_cast<std::uint8_t>(static_cast<char>(*iter));
        ++iter;
        if(byte1 != 0x80)
            return error_value;
        return 0;
    }
    // 0x80..0xC1 and 0xF5..0xFF can never start a sequence accepted here
    if(byte0 > 0xF4 || byte0 < 0xC2)
        return error_value;
    if(iter == sentinel)
        return error_value;
    // peek at the second byte; it is only consumed once it is known to be a continuation byte
    auto byte1 = static_cast<std::uint8_t>(static_cast<char>(*iter));
    if(byte1 < 0x80 || byte1 >= 0xC0)
        return error_value;
    if(byte0 < 0xE0)
    {
        ++iter;
        return (static_cast<std::uint_fast32_t>(byte0 & 0x1F) << 6) | (byte1 & 0x3F);
    }
    // reject second bytes that would give an overlong encoding or a value above 0x10FFFF
    if(byte0 == 0xE0 && byte1 < 0xA0)
        return error_value;
    if(byte0 == 0xF0 && byte1 < 0x90)
        return error_value;
    if(byte0 == 0xF4 && byte1 >= 0x90)
        return error_value;
    // ED A0..BF xx encodes U+D800..U+DFFF
    if(!allow_surrogate_code_points && byte0 == 0xED && byte1 >= 0xA0)
        return error_value;
    // consume the second byte, then make sure a third byte exists before dereferencing
    ++iter;
    if(iter == sentinel)
        return error_value;
    auto byte2 = static_cast<std::uint8_t>(static_cast<char>(*iter));
    ++iter;
    if(byte2 < 0x80 || byte2 >= 0xC0)
        return error_value;
    if(byte0 < 0xF0)
        return (static_cast<std::uint_fast32_t>(byte0 & 0xF) << 12)
               | (static_cast<std::uint_fast32_t>(byte1 & 0x3F) << 6) | (byte2 & 0x3F);
    if(iter == sentinel)
        return error_value;
    auto byte3 = static_cast<std::uint8_t>(static_cast<char>(*iter));
    ++iter;
    if(byte3 < 0x80 || byte3 >= 0xC0)
        return error_value;
    return (static_cast<std::uint_fast32_t>(byte0 & 0x7) << 18)
           | (static_cast<std::uint_fast32_t>(byte1 & 0x3F) << 12)
           | (static_cast<std::uint_fast32_t>(byte2 & 0x3F) << 6) | (byte3 & 0x3F);
}
113
/// Fixed-capacity buffer holding the code units produced by encoding one code point.
///
/// Stores up to `N` code units of type `T` in `chars`; `used` is the number of valid units.
template <typename T, std::size_t N>
struct Encoded_character final
{
    static constexpr std::size_t max_Chars = N;
    typedef T Char_type;
    static_assert(max_Chars != 0, "");
    Char_type chars[max_Chars];
    std::size_t used;
    /// First stored code unit.
    constexpr Char_type &front()
    {
        return chars[0];
    }
    constexpr const Char_type &front() const
    {
        return chars[0];
    }
    /// Last stored code unit (`chars[used - 1]`). When nothing is stored this falls back to
    /// `chars[0]` so the access stays inside the array.
    constexpr Char_type &back()
    {
        return chars[used == 0 ? 0 : used - 1];
    }
    constexpr const Char_type &back() const
    {
        return chars[used == 0 ? 0 : used - 1];
    }
    typedef const Char_type *const_iterator;
    typedef Char_type *iterator;
    constexpr const_iterator begin() const
    {
        return &chars[0];
    }
    constexpr const_iterator end() const
    {
        return begin() + used;
    }
    constexpr const_iterator cbegin() const
    {
        return &chars[0];
    }
    constexpr const_iterator cend() const
    {
        return begin() + used;
    }
    constexpr iterator begin()
    {
        return &chars[0];
    }
    constexpr iterator end()
    {
        return begin() + used;
    }
    /// Maximum number of code units this object can hold.
    constexpr std::size_t capacity() const
    {
        return max_Chars;
    }
    /// Number of code units currently stored.
    constexpr std::size_t size() const
    {
        return used;
    }
    constexpr const Char_type &operator[](std::size_t index) const
    {
        assert(index < used);
        return chars[index];
    }
    constexpr Char_type &operator[](std::size_t index)
    {
        assert(index < used);
        return chars[index];
    }
    /// Creates an empty character: all units value-initialized, `used == 0`.
    constexpr Encoded_character() : chars(), used(0)
    {
    }

private:
    // funnels every constructor argument through an implicit conversion to Char_type
    static constexpr Char_type implicit_conversion_helper(Char_type ch) noexcept
    {
        return ch;
    }

public:
    /// Creates a character from the given code units, each implicitly converted to `Char_type`.
    ///
    /// Removed from overload resolution when the first argument is itself an
    /// `Encoded_character`; otherwise copying a non-const lvalue would select this template
    /// instead of the copy constructor and fail to compile.
    template <typename Arg0,
              typename... Args,
              typename = typename std::enable_if<
                  !std::is_same<typename std::decay<Arg0>::type, Encoded_character>::value>::type>
    constexpr Encoded_character(Arg0 &&arg0, Args &&... args)
        : chars{implicit_conversion_helper(std::forward<Arg0>(arg0)),
                implicit_conversion_helper(std::forward<Args>(args))...},
          used(1 + sizeof...(args))
    {
        static_assert(1 + sizeof...(args) <= max_Chars, "");
    }
    /// Converts the stored code units to a string.
    template <typename Char_traits, typename Allocator>
    operator std::basic_string<Char_type, Char_traits, Allocator>() const
    {
        return std::basic_string<Char_type, Char_traits, Allocator>(begin(), end());
    }
    /// Returns `a` with the code units of `b` appended.
    template <typename Char_traits, typename Allocator>
    friend std::basic_string<Char_type, Char_traits, Allocator> operator+(
        std::basic_string<Char_type, Char_traits, Allocator> a, const Encoded_character &b)
    {
        a.append(b.begin(), b.end());
        return a;
    }
    /// Returns `b` with the code units of `a` inserted at the front.
    template <typename Char_traits, typename Allocator>
    friend std::basic_string<Char_type, Char_traits, Allocator> operator+(
        const Encoded_character &a, std::basic_string<Char_type, Char_traits, Allocator> b)
    {
        b.insert(b.begin(), a.begin(), a.end());
        return b;
    }
    /// Concatenates two encoded characters into a string.
    template <std::size_t N2>
    friend std::basic_string<Char_type> operator+(const Encoded_character &a,
                                                  const Encoded_character<Char_type, N2> &b)
    {
        std::basic_string<Char_type> retval;
        retval.reserve(a.size() + b.size());
        retval.append(a.begin(), a.end());
        retval.append(b.begin(), b.end());
        return retval;
    }
    /// Writes the stored code units to `os` by way of a temporary string.
    template <typename Traits>
    friend std::basic_ostream<Char_type, Traits> &operator<<(
        std::basic_ostream<Char_type, Traits> &os, const Encoded_character &a)
    {
        os << static_cast<std::basic_string<Char_type, Traits>>(a);
        return os;
    }
};
236
237 Encoded_character<char, 4> encode_utf8(char32_t ch, bool use_2_byte_null = false) noexcept
238 {
239 assert(ch < 0x10FFFFUL && ch >= 0);
240 if(use_2_byte_null && ch == 0)
241 return Encoded_character<char, 4>(0xC0U, 0x80U);
242 if(ch < 0x80)
243 return Encoded_character<char, 4>(ch);
244 if(ch < 0x800)
245 return Encoded_character<char, 4>(0xC0 | (ch >> 6), 0x80 | (ch & 0x3F));
246 if(ch < 0x10000UL)
247 return Encoded_character<char, 4>(
248 0xE0 | (ch >> 12), 0x80 | ((ch >> 6) & 0x3F), 0x80 | (ch & 0x3F));
249 return Encoded_character<char, 4>(0xF0 | (ch >> 18),
250 0x80 | ((ch >> 12) & 0x3F),
251 0x80 | ((ch >> 6) & 0x3F),
252 0x80 | (ch & 0x3F));
253 }
254
255 template <typename Input_iterator, typename Sentinel>
256 typename std::char_traits<char32_t>::int_type decode_utf16(
257 Input_iterator &iter,
258 Sentinel sentinel,
259 bool allow_unpaired_surrogate_code_units = true,
260 typename std::char_traits<char32_t>::int_type error_value =
261 replacement_character) noexcept(noexcept(++iter) && noexcept(static_cast<char16_t>(*iter))
262 && noexcept(iter == sentinel ? 0 : 0))
263 {
264 if(iter == sentinel)
265 return std::char_traits<char32_t>::eof();
266 auto unit0 = static_cast<std::uint16_t>(static_cast<char16_t>(*iter));
267 ++iter;
268 if(unit0 >= 0xD800U && unit0 < 0xDC00U)
269 {
270 if(iter == sentinel)
271 return allow_unpaired_surrogate_code_units ? unit0 : error_value;
272 auto unit1 = static_cast<std::uint16_t>(static_cast<char16_t>(*iter));
273 if(unit1 < 0xDC00U || unit1 >= 0xE000U)
274 return allow_unpaired_surrogate_code_units ? unit0 : error_value;
275 ++iter;
276 return 0x10000UL + ((unit0 & 0x3FF) << 10) + (unit1 & 0x3FF);
277 }
278 return unit0;
279 }
280
281 Encoded_character<char16_t, 2> encode_utf16(char32_t ch) noexcept
282 {
283 assert(ch < 0x10FFFFUL && ch >= 0);
284 if(ch < 0x10000UL)
285 return Encoded_character<char16_t, 2>(ch);
286 return Encoded_character<char16_t, 2>(0xD800U | ((ch - 0x10000UL) >> 10),
287 0xDC00U | ((ch - 0x10000UL) & 0x3FF));
288 }
289
290 template <typename Input_iterator, typename Sentinel>
291 typename std::char_traits<char32_t>::int_type decode_utf32(
292 Input_iterator &iter,
293 Sentinel sentinel,
294 bool allow_Surrogate_Code_Units = true,
295 typename std::char_traits<char32_t>::int_type error_value =
296 replacement_character) noexcept(noexcept(++iter) && noexcept(static_cast<char32_t>(*iter))
297 && noexcept(iter == sentinel ? 0 : 0))
298 {
299 if(iter == sentinel)
300 return std::char_traits<char32_t>::eof();
301 auto retval = static_cast<std::uint32_t>(static_cast<char32_t>(*iter));
302 ++iter;
303 if(retval > 0x10FFFFUL)
304 return error_value;
305 if(!allow_Surrogate_Code_Units && retval >= 0xD800U && retval < 0xE000U)
306 return error_value;
307 return retval;
308 }
309
310 Encoded_character<char32_t, 1> encode_utf32(char32_t ch) noexcept
311 {
312 return Encoded_character<char32_t, 1>(ch);
313 }
314
// wchar_t must be a binary integer type ...
static_assert(std::numeric_limits<wchar_t>::radix == 2, "");
// ... with at least 16 bits; `digits` does not count the sign bit, so add it back when signed
static_assert(
    std::numeric_limits<wchar_t>::digits + (std::is_signed<wchar_t>::value ? 1 : 0) >= 16, "");

/// True when wchar_t has at most 16 value bits, in which case wide strings are handled as
/// UTF-16; otherwise they are handled as UTF-32.
constexpr bool is_wide_character_utf16 = !(std::numeric_limits<wchar_t>::digits > 16);
322
323 Encoded_character<wchar_t, 2> encode_wide(char32_t ch) noexcept
324 {
325 if(is_wide_character_utf16)
326 {
327 auto result = encode_utf16(ch);
328 Encoded_character<wchar_t, 2> retval;
329 retval.used = result.used;
330 for(std::size_t i = 0; i < result.size(); i++)
331 {
332 retval[i] = static_cast<wchar_t>(result[i]);
333 }
334 return retval;
335 }
336 return Encoded_character<wchar_t, 2>(static_cast<wchar_t>(ch));
337 }
338
339 template <typename Input_iterator, typename Sentinel>
340 typename std::char_traits<char32_t>::int_type decode_wide(
341 Input_iterator &iter,
342 Sentinel sentinel,
343 bool allow_unpaired_surrogate_code_units = true,
344 typename std::char_traits<char32_t>::int_type error_value =
345 replacement_character) noexcept(noexcept(++iter) && noexcept(static_cast<wchar_t>(*iter))
346 && noexcept(iter == sentinel ? 0 : 0))
347 {
348 struct Iterator_wrapper
349 {
350 Input_iterator &iter;
351 Iterator_wrapper(Input_iterator &iter) : iter(iter)
352 {
353 }
354 void operator++()
355 {
356 ++iter;
357 }
358 wchar_t operator*()
359 {
360 return static_cast<wchar_t>(*iter);
361 }
362 bool operator==(Sentinel &sentinel)
363 {
364 return iter == sentinel;
365 }
366 };
367 Iterator_wrapper iterator_wrapper(iter);
368 if(is_wide_character_utf16)
369 return decode_utf16(iterator_wrapper,
370 std::move(sentinel),
371 allow_unpaired_surrogate_code_units,
372 error_value);
373 return decode_utf32(
374 iterator_wrapper, std::move(sentinel), allow_unpaired_surrogate_code_units, error_value);
375 }
376
377 struct Convert_options final
378 {
379 typename std::char_traits<char32_t>::int_type error_value = replacement_character;
380 bool allow_unpaired_surrogate_code_points = true;
381 bool allow_2_byte_null = false;
382 bool use_2_byte_null = false;
383 constexpr Convert_options()
384 {
385 }
386 constexpr Convert_options(typename std::char_traits<char32_t>::int_type error_value,
387 bool allow_unpaired_surrogate_code_points,
388 bool allow_2_byte_null,
389 bool use_2_byte_null)
390 : error_value(error_value),
391 allow_unpaired_surrogate_code_points(allow_unpaired_surrogate_code_points),
392 allow_2_byte_null(allow_2_byte_null),
393 use_2_byte_null(use_2_byte_null)
394 {
395 }
396 static constexpr Convert_options strict(
397 typename std::char_traits<char32_t>::int_type error_value = replacement_character)
398 {
399 return Convert_options(error_value, false, false, false);
400 }
401 static constexpr Convert_options java(
402 typename std::char_traits<char32_t>::int_type error_value = replacement_character)
403 {
404 return Convert_options(error_value, true, true, true);
405 }
406 };
407
408 template <typename Char_type>
409 struct Decode_encode_functions
410 {
411 template <typename Input_iterator, typename Sentinel>
412 static typename std::char_traits<char32_t>::int_type decode(
413 Input_iterator &iter, Sentinel sentinel, const Convert_options &convert_options) = delete;
414 static Encoded_character<Char_type, 1> encode(
415 char32_t ch, const Convert_options &convert_options) noexcept = delete;
416 };
417
418 template <>
419 struct Decode_encode_functions<char>
420 {
421 template <typename Input_iterator, typename Sentinel>
422 static typename std::char_traits<char32_t>::int_type decode(
423 Input_iterator &iter,
424 Sentinel sentinel,
425 const Convert_options
426 &convert_options) noexcept(noexcept(decode_utf8(std::declval<Input_iterator &>(),
427 std::declval<Sentinel &&>())))
428 {
429 return decode_utf8(iter,
430 std::move(sentinel),
431 convert_options.allow_unpaired_surrogate_code_points,
432 convert_options.allow_2_byte_null,
433 convert_options.error_value);
434 }
435 static Encoded_character<char, 4> encode(char32_t ch,
436 const Convert_options &convert_options) noexcept
437 {
438 return encode_utf8(ch, convert_options.use_2_byte_null);
439 }
440 };
441
442 template <>
443 struct Decode_encode_functions<char16_t>
444 {
445 template <typename Input_iterator, typename Sentinel>
446 static typename std::char_traits<char32_t>::int_type decode(
447 Input_iterator &iter,
448 Sentinel sentinel,
449 const Convert_options
450 &convert_options) noexcept(noexcept(decode_utf16(std::declval<Input_iterator &>(),
451 std::declval<Sentinel &&>())))
452 {
453 return decode_utf16(iter,
454 std::move(sentinel),
455 convert_options.allow_unpaired_surrogate_code_points,
456 convert_options.error_value);
457 }
458 static Encoded_character<char16_t, 2> encode(char32_t ch,
459 const Convert_options &convert_options) noexcept
460 {
461 return encode_utf16(ch);
462 }
463 };
464
465 template <>
466 struct Decode_encode_functions<char32_t>
467 {
468 template <typename Input_iterator, typename Sentinel>
469 static typename std::char_traits<char32_t>::int_type decode(
470 Input_iterator &iter,
471 Sentinel sentinel,
472 const Convert_options
473 &convert_options) noexcept(noexcept(decode_utf32(std::declval<Input_iterator &>(),
474 std::declval<Sentinel &&>())))
475 {
476 return decode_utf32(iter,
477 std::move(sentinel),
478 convert_options.allow_unpaired_surrogate_code_points,
479 convert_options.error_value);
480 }
481 static Encoded_character<char32_t, 1> encode(char32_t ch,
482 const Convert_options &convert_options) noexcept
483 {
484 return encode_utf32(ch);
485 }
486 };
487
488 template <>
489 struct Decode_encode_functions<wchar_t>
490 {
491 template <typename Input_iterator, typename Sentinel>
492 static typename std::char_traits<char32_t>::int_type decode(
493 Input_iterator &iter,
494 Sentinel sentinel,
495 const Convert_options
496 &convert_options) noexcept(noexcept(decode_wide(std::declval<Input_iterator &>(),
497 std::declval<Sentinel &&>())))
498 {
499 return decode_wide(iter,
500 std::move(sentinel),
501 convert_options.allow_unpaired_surrogate_code_points,
502 convert_options.error_value);
503 }
504 static Encoded_character<wchar_t, 2> encode(char32_t ch,
505 const Convert_options &convert_options) noexcept
506 {
507 return encode_wide(ch);
508 }
509 };
510
511 namespace detail
512 {
513 template <typename Target, typename Source>
514 struct String_cast_helper;
515
516 template <typename Target_Char_type,
517 typename Target_Traits,
518 typename Target_Allocator,
519 typename Source_Char_type,
520 typename Source_Traits>
521 struct String_cast_helper<std::basic_string<Target_Char_type, Target_Traits, Target_Allocator>,
522 basic_string_view<Source_Char_type, Source_Traits>>
523 {
524 static std::basic_string<Target_Char_type, Target_Traits, Target_Allocator> run(
525 basic_string_view<Source_Char_type, Source_Traits> source,
526 const Convert_options &convert_options)
527 {
528 std::basic_string<Target_Char_type, Target_Traits, Target_Allocator> retval;
529 for(auto iter = source.begin(); iter != source.end();)
530 {
531 retval = std::move(retval) + Decode_encode_functions<Target_Char_type>::encode(
532 Decode_encode_functions<Source_Char_type>::decode(
533 iter, source.end(), convert_options),
534 convert_options);
535 }
536 return retval;
537 }
538 };
539
540 template <typename Char_type,
541 typename Target_Traits,
542 typename Target_Allocator,
543 typename Source_Traits>
544 struct String_cast_helper<std::basic_string<Char_type, Target_Traits, Target_Allocator>,
545 basic_string_view<Char_type, Source_Traits>>
546 {
547 static std::basic_string<Char_type, Target_Traits, Target_Allocator> run(
548 basic_string_view<Char_type, Source_Traits> source, const Convert_options &)
549 {
550 return std::basic_string<Char_type, Target_Traits, Target_Allocator>(source.begin(),
551 source.end());
552 }
553 };
554 }
555
556 template <typename Target, typename Source_Char_type, typename Source_Traits>
557 Target string_cast(basic_string_view<Source_Char_type, Source_Traits> source,
558 const Convert_options &convert_options)
559 {
560 return detail::String_cast_helper<Target, basic_string_view<Source_Char_type, Source_Traits>>::
561 run(source, convert_options);
562 }
563
564 template <typename Target, typename Source_Char_type, typename Source_Traits>
565 Target string_cast(basic_string_view<Source_Char_type, Source_Traits> source)
566 {
567 return detail::String_cast_helper<Target, basic_string_view<Source_Char_type, Source_Traits>>::
568 run(source, Convert_options());
569 }
570 }
571 }
572 }
573
574 #endif /* UTIL_TEXT_H_ */