HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
UTF.hpp
1// Copyright Take Vos 2020-2021.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "../required.hpp"
8#include "../endian.hpp"
9#include "../CP1252.hpp"
10#include <type_traits>
11#include <iterator>
12#include <bit>
13
14namespace tt {
15
26template<typename Iterator>
27[[nodiscard]] constexpr char32_t utf16_to_utf32(Iterator &it) noexcept
28{
29 using value_type = typename std::iterator_traits<Iterator>::value_type;
30 static_assert(std::is_same_v<value_type, char16_t>, "Iterator must point to a char16_t");
31
32 ttlet first = *(it++);
33 if (first <= 0xd7ff || first >= 0xe000) {
34 return first;
35
36 } else {
37 tt_axiom(first <= 0xdbff, "Expecting the high surrogate");
38 ttlet second = *(it++);
39 tt_axiom(second >= 0xdc00 && second <= 0xdfff, "Expecting the low surrogate");
40
41 return ((static_cast<char32_t>(first - 0xd800) << 10) | (static_cast<char32_t>(second - 0xdc00))) + 0x01'0000;
42 }
43}
44
55template<typename Iterator>
56[[nodiscard]] constexpr char32_t utf8_to_utf32(Iterator &it) noexcept
57{
58 using value_type = typename std::iterator_traits<Iterator>::value_type;
59 static_assert(std::is_same_v<value_type, char8_t>, "Iterator must point to a char8_t");
60
61 auto cp = char32_t{0};
62
63 ttlet cu = *(it++);
64 if (cu <= 0x7f) {
65 return static_cast<char32_t>(cu);
66
67 } else if (cu <= 0xdf) {
68 tt_axiom(cu >= 0xc0, "UTF-8 encoded code-point can not start with continuation code-units");
69 cp = static_cast<char32_t>(cu & 0x1f);
70 cp <<= 6;
71 cp |= static_cast<char32_t>(*(it++) & 0x3f);
72 tt_axiom(cp >= 0x0080 && cp <= 0x07ff, "UTF-8 Overlong encoding");
73 return cp;
74
75 } else if (cu <= 0xef) {
76 cp = static_cast<char32_t>(cu & 0x0f);
77 cp <<= 6;
78 cp |= static_cast<char32_t>(*(it++) & 0x3f);
79 cp <<= 6;
80 cp |= static_cast<char32_t>(*(it++) & 0x3f);
81 tt_axiom(cp >= 0x0800 && cp <= 0xffff, "UTF-8 Overlong encoding");
82 tt_axiom(!(cp >= 0xd800 && cp <= 0xdfff), "UTF-8 Must not encode surrogates");
83 return cp;
84
85 } else {
86 tt_axiom(cu <= 0xf7, "UTF8 encoded code-point must have a valid start code-unit");
87 cp = static_cast<char32_t>(cu & 0x07);
88 cp <<= 6;
89 cp |= static_cast<char32_t>(*(it++) & 0x3f);
90 cp <<= 6;
91 cp |= static_cast<char32_t>(*(it++) & 0x3f);
92 cp <<= 6;
93 cp |= static_cast<char32_t>(*(it++) & 0x3f);
94 tt_axiom(cp >= 0x100000 && cp <= 0x10ffff, "UTF-8 Overlong encoding");
95 return cp;
96 }
97}
98
111template<typename Iterator>
112constexpr bool utf8_to_utf32(Iterator &it, Iterator last, char32_t &code_point) noexcept
113{
114 using value_type = typename std::iterator_traits<Iterator>::value_type;
115 static_assert(std::is_same_v<value_type, char8_t>, "Iterator must point to a char8_t");
116
117 auto continuation_count = 0;
118 auto first_cu = *(it++);
119 if (first_cu <= 0x7f) {
120 code_point = first_cu;
121 return true;
122
123 } else if (first_cu <= 0xbf) {
124 // Invalid continuation character.
125 code_point = CP1252_to_UTF32(static_cast<char>(first_cu));
126 return false;
127
128 } else if (first_cu <= 0xdf) {
129 code_point = static_cast<char32_t>(first_cu & 0x1f);
130 continuation_count = 1;
131
132 } else if (first_cu <= 0xef) {
133 code_point = static_cast<char32_t>(first_cu & 0x0f);
134 continuation_count = 2;
135
136 } else if (first_cu <= 0xf7) {
137 code_point = static_cast<char32_t>(first_cu & 0x07);
138 continuation_count = 3;
139
140 } else {
141 // Invalid code-unit
142 code_point = CP1252_to_UTF32(static_cast<char>(first_cu));
143 return false;
144 }
145
146 auto old_it = it;
147 for (int i = 0; i != continuation_count; ++i) {
148 if (it == last || (*it & 0xc0) != 0x80) {
149 // No continuation character, or at end of string.
150 [[unlikely]] code_point = CP1252_to_UTF32(static_cast<char>(first_cu));
151 it = old_it;
152 return false;
153 }
154
155 code_point <<= 6;
156 code_point = *(it++) & 0x3f;
157 }
158
159 if ((code_point >= 0xd800 && code_point <= 0xdfff) || // Surrogate pair
160 (continuation_count == 1 && code_point < 0x0080) || // Overlong
161 (continuation_count == 2 && code_point < 0x0800) || // Overlong
162 (continuation_count == 3 && code_point < 0x10000) // Overlong
163 ) {
164 // Surrogate pair
165 code_point = CP1252_to_UTF32(static_cast<char>(first_cu));
166 it = old_it;
167 return false;
168 }
169
170 return true;
171}
172
181template<typename BackInsertIterator>
182constexpr void utf32_to_utf16(char32_t code_point, BackInsertIterator &it) noexcept
183{
184 using value_type = typename BackInsertIterator::container_type::value_type;
185 static_assert(sizeof(value_type) == 2, "Iterator must point to a two byte character type");
186
187 if (code_point <= 0xffff) {
188 tt_axiom(!(code_point >= 0xd800 && code_point <= 0xdfff), "Code Point must not be a surrogate-code");
189 *(it++) = static_cast<value_type>(code_point);
190
191 } else {
192 tt_axiom(code_point <= 0x10ffff, "Code Point must be in range of the 17 planes");
193
194 code_point -= 0x10000;
195 *(it++) = static_cast<value_type>(code_point >> 10) | 0xd800;
196 *(it++) = static_cast<value_type>(code_point) & 0x03ff | 0xdc00;
197 }
198}
199
208template<typename BackInsertIterator>
209constexpr void utf32_to_utf8(char32_t code_point, BackInsertIterator &it) noexcept
210{
211 using value_type = typename BackInsertIterator::container_type::value_type;
212 static_assert(sizeof(value_type) == 1, "UTF-8 values must be stored in a 1 byte character type");
213
214 if (code_point <= 0x7f) {
215 *(it++) = static_cast<value_type>(code_point);
216
217 } else if (code_point <= 0x07ff) {
218 *(it++) = static_cast<value_type>(code_point >> 6) | 0xc0;
219 *(it++) = static_cast<value_type>(code_point) & 0x3f | 0x80;
220
221 } else if (code_point <= 0xffff) {
222 tt_axiom(!(code_point >= 0xd800 && code_point <= 0xdfff), "Code Point must not be a surrogate");
223 *(it++) = static_cast<value_type>(code_point >> 12) | 0xe0;
224 *(it++) = static_cast<value_type>(code_point >> 6) & 0x3f | 0x80;
225 *(it++) = static_cast<value_type>(code_point) & 0x3f | 0x80;
226
227 } else {
228 tt_axiom(code_point <= 0x10ffff, "Code Point must be in range of the 17 planes");
229 *(it++) = static_cast<value_type>(code_point >> 18) | 0xf0;
230 *(it++) = static_cast<value_type>(code_point >> 12) & 0x3f | 0x80;
231 *(it++) = static_cast<value_type>(code_point >> 6) & 0x3f | 0x80;
232 *(it++) = static_cast<value_type>(code_point) & 0x3f | 0x80;
233 }
234}
235
/** Make a UTF-32 string valid by replacing invalid code points.
 *
 * Surrogates (U+D800..U+DFFF) and values above U+10FFFF are replaced
 * by U+FFFD; the length of the string is unchanged.
 *
 * @param rhs The string to sanitize; its storage is moved into the result.
 * @return The sanitized string.
 */
[[nodiscard]] inline std::u32string sanitize_u32string(std::u32string &&rhs) noexcept
{
    constexpr char32_t replacement = U'\ufffd';

    auto sanitized = std::move(rhs);

    for (auto pos = sanitized.begin(); pos != sanitized.end(); ++pos) {
        const bool beyond_unicode = *pos > 0x10'ffff;
        const bool is_surrogate = *pos >= 0xd800 && *pos <= 0xdfff;
        if (beyond_unicode || is_surrogate) {
            *pos = replacement;
        }
    }

    return sanitized;
}
255
268template<typename Container, std::endian Endian = std::endian::native>
269[[nodiscard]] std::u16string make_u16string(Container const &rhs) noexcept
270{
271 auto r = std::u16string{};
272
273 if constexpr (sizeof(Container::value_type) == 1) {
274 // A byte array of some sorts, copy each pair of bytes.
275 r.reserve((size(rhs) + 1) / 2);
276 for (ssize_t i = 0; i < ssize(rhs); i += 2) {
277 if constexpr (Endian == std::endian::little) {
278 r += static_cast<char16_t>(rhs[i]) << 8 | static_cast<char16_t>(rhs[i + 1]);
279 } else {
280 r += static_cast<char16_t>(rhs[i]) | static_cast<char16_t>(rhs[i + 1]) << 8;
281 }
282 }
283 if (size(rhs) % 2 == 1) {
284 // Odd number of bytes.
285 r += 0xfffd;
286 }
287
288 } else {
289 // An array of 16-bits or larger.
290 r.reserve(size(rhs));
291 for (auto &&c : rhs) {
292 r += Endian == std::endian::native ? static_cast<char16_t>(c) : static_cast<char16_t>(byte_swap(c));
293 }
294 }
295
296 return r;
297}
298
307[[nodiscard]] inline std::u16string sanitize_u16string(std::u16string &&rhs) noexcept
308{
309 auto r = std::move(rhs);
310
311 auto swap_endian = false;
312
313 ttlet length = size(r);
314 auto i = 0;
315 while (i != length) {
316 auto code_unit = r[i];
317 if (swap_endian) {
318 code_unit = r[i] = byte_swap(code_unit);
319 }
320
321 if (code_unit == 0xfffe) {
322 // Byte-swapped BOM.
323 swap_endian = !swap_endian;
324 // Don't increment iterator
325
326 } else if (code_unit >= 0xd800 && code_unit <= 0xdbff) {
327 // Found high surrogate.
328 auto old_i = i++;
329 if (i != length) {
330 auto next_code_unit = r[i];
331 if (next_code_unit >= 0xdc00 && next_code_unit <= 0xdfff) {
332 // valid surrogate pair
333 ++i;
334
335 } else {
336 // Invalid surrogate pair.
337 // Replace the high-surrogate, then sync back to the current code-unit.
338 r[old_i] = char16_t{0xfffd};
339 }
340
341 } else {
342 // High surrogate at end of string.
343 r[old_i] = char16_t{0xfffd};
344 }
345
346 } else if (code_unit >= 0xdc00 && code_unit <= 0xdfff) {
347 // Found invalid low surrogate.
348 r[i++] = char16_t{0xfffd};
349
350 } else {
351 ++i;
352 }
353 }
354
355 return r;
356}
357
358template<typename Container>
359[[nodiscard]] std::u8string make_u8string(Container const &rhs) noexcept
360{
361 auto r = std::u8string{};
362
363 // An array of 8-bits.
364 r.reserve(size(rhs));
365 for (auto &&c : rhs) {
366 r += static_cast<char8_t>(c);
367 }
368
369 return r;
370}
371
380[[nodiscard]] inline std::u8string sanitize_u8string(std::u8string &&rhs) noexcept
381{
382 auto r = std::move(rhs);
383
384 ttlet first = begin(rhs);
385 ttlet last = end(rhs);
386
387 auto code_point = char32_t{};
388 auto valid = true;
389 auto old_it = first;
390 for (auto it = first; valid && it != last;) {
391 old_it = it;
392 valid &= utf8_to_utf32(it, last, code_point);
393 }
394
395 if (valid) {
396 return r;
397 }
398
399 // Copy the valid UTF-8 code units and prepare for
400 // re-encoding the rest of the string.
401 auto tmp = std::u8string{begin(r), old_it};
402 tmp.reserve(size(r));
403
404 // Add the last decoded code point.
405 auto tmp_i = std::back_inserter(tmp);
406 utf32_to_utf8(code_point, tmp_i);
407
408 // Re-encode the rest of the string.
409 for (auto it = old_it + 1; it != last;) {
410 utf8_to_utf32(it, last, code_point);
411 utf32_to_utf8(code_point, tmp_i);
412 }
413
414 std::swap(r, tmp);
415 return r;
416}
417
418namespace detail {
419
420template<typename StringT>
421[[nodiscard]] inline StringT to_u8string(std::u16string_view const &rhs) noexcept
422{
423 auto r = StringT{};
424 r.reserve(rhs.size());
425 auto r_it = std::back_inserter(r);
426
427 for (auto it = std::begin(rhs); it != std::end(rhs);) {
428 ttlet c32 = utf16_to_utf32(it);
429 utf32_to_utf8(c32, r_it);
430 }
431 return r;
432}
433
/** Transcode UTF-32 to UTF-8 into a string type of choice.
 *
 * @tparam StringT The string type to return, with one-byte characters.
 * @param rhs The UTF-32 text.
 * @return The text encoded as UTF-8.
 */
template<typename StringT>
[[nodiscard]] inline StringT to_u8string(std::u32string_view const &rhs) noexcept
{
    StringT encoded;
    encoded.reserve(rhs.size());
    auto sink = std::back_inserter(encoded);

    // Every element is a complete code point.
    for (auto pos = std::begin(rhs); pos != std::end(rhs); ++pos) {
        utf32_to_utf8(*pos, sink);
    }
    return encoded;
}
446
447template<typename StringT>
448[[nodiscard]] inline StringT to_u8string(std::wstring_view const &rhs) noexcept
449{
450 if constexpr (sizeof(std::wstring::value_type) == 2) {
451 auto s16 = sanitize_u16string(std::u16string{reinterpret_cast<char16_t const *>(rhs.data()), rhs.size()});
452 return to_u8string<StringT>(std::move(s16));
453 } else {
454 auto s32 = sanitize_u32string(std::u32string{reinterpret_cast<char32_t const *>(rhs.data()), rhs.size()});
455 return to_u8string<StringT>(std::move(s32));
456 }
457}
458
459} // namespace detail
460
467[[nodiscard]] inline std::string to_string(std::u8string_view const &rhs) noexcept
468{
469 return std::string{reinterpret_cast<char const *>(rhs.data()), rhs.size()};
470}
471
478[[nodiscard]] inline std::u8string to_u8string(std::u16string_view const &rhs) noexcept
479{
480 return detail::to_u8string<std::u8string>(rhs);
481}
482
489[[nodiscard]] inline std::string to_string(std::u16string_view const &rhs) noexcept
490{
491 return detail::to_u8string<std::string>(rhs);
492}
493
500[[nodiscard]] inline std::u8string to_u8string(std::u32string_view const &rhs) noexcept
501{
502 return detail::to_u8string<std::u8string>(rhs);
503}
504
511[[nodiscard]] inline std::string to_string(std::u32string_view const &rhs) noexcept
512{
513 return detail::to_u8string<std::string>(rhs);
514}
515
522[[nodiscard]] inline std::u16string to_u16string(std::u8string_view const &rhs) noexcept
523{
524 auto r = std::u16string{};
525 r.reserve(rhs.size());
526 auto r_it = std::back_inserter(r);
527
528 for (auto it = std::begin(rhs); it != std::end(rhs);) {
529 ttlet c32 = utf8_to_utf32(it);
530 utf32_to_utf16(c32, r_it);
531 }
532 return r;
533}
534
541[[nodiscard]] inline std::u16string to_u16string(std::u32string_view const &rhs) noexcept
542{
543 auto r = std::u16string{};
544 r.reserve(rhs.size());
545 auto r_it = std::back_inserter(r);
546
547 for (auto c32 : rhs) {
548 utf32_to_utf16(c32, r_it);
549 }
550 return r;
551}
552
559[[nodiscard]] inline std::u32string to_u32string(std::u8string_view const &rhs) noexcept
560{
561 auto r = std::u32string{};
562 r.reserve(rhs.size());
563
564 for (auto it = std::begin(rhs); it != std::end(rhs);) {
565 r += utf8_to_utf32(it);
566 }
567 return r;
568}
569
576[[nodiscard]] inline std::u32string to_u32string(std::u16string_view const &rhs) noexcept
577{
578 auto r = std::u32string{};
579 r.reserve(rhs.size());
580
581 for (auto it = std::begin(rhs); it != std::end(rhs);) {
582 r += utf16_to_utf32(it);
583 }
584 return r;
585}
586
593[[nodiscard]] inline std::u8string to_u8string(std::string_view const &rhs) noexcept
594{
595 return sanitize_u8string(std::u8string{reinterpret_cast<char8_t const *>(rhs.data()), rhs.size()});
596}
597
604[[nodiscard]] inline std::u16string to_u16string(std::string_view const &rhs) noexcept
605{
606 return to_u16string(sanitize_u8string(std::u8string{reinterpret_cast<char8_t const *>(rhs.data()), rhs.size()}));
607}
608
615[[nodiscard]] inline std::u32string to_u32string(std::string_view const &rhs) noexcept
616{
617 return to_u32string(sanitize_u8string(std::u8string{reinterpret_cast<char8_t const *>(rhs.data()), rhs.size()}));
618}
619
626[[nodiscard]] inline std::u8string to_u8string(std::wstring_view const &rhs) noexcept
627{
628 return detail::to_u8string<std::u8string>(rhs);
629}
630
637[[nodiscard]] inline std::string to_string(std::wstring_view const &rhs) noexcept
638{
639 return detail::to_u8string<std::string>(rhs);
640}
641
648[[nodiscard]] inline std::u16string to_u16string(std::wstring_view const &rhs) noexcept
649{
650 if constexpr (sizeof(std::wstring::value_type) == 2) {
651 return sanitize_u16string(std::u16string{reinterpret_cast<char16_t const *>(rhs.data()), rhs.size()});
652 } else {
653 auto s32 = sanitize_u32string(std::u32string{reinterpret_cast<char32_t const *>(rhs.data()), rhs.size()});
654 return to_u16string(std::move(s32));
655 }
656}
657
664[[nodiscard]] inline std::u32string to_u32string(std::wstring_view const &rhs) noexcept
665{
666 if constexpr (sizeof(std::wstring::value_type) == 2) {
667 auto s16 = sanitize_u16string(std::u16string{reinterpret_cast<char16_t const *>(rhs.data()), rhs.size()});
668 return to_u32string(std::move(s16));
669 } else {
670 return sanitize_u32string(std::u32string{reinterpret_cast<char32_t const *>(rhs.data()), rhs.size()});
671 }
672}
673
680[[nodiscard]] inline std::wstring to_wstring(std::u8string_view const &rhs) noexcept
681{
682 if constexpr (sizeof(std::wstring::value_type) == 2) {
683 auto s16 = to_u16string(rhs);
684 return std::wstring{reinterpret_cast<wchar_t const *>(s16.data()), s16.size()};
685 } else {
686 auto s32 = to_u32string(rhs);
687 return std::wstring{reinterpret_cast<wchar_t const *>(s32.data()), s32.size()};
688 }
689}
690
697[[nodiscard]] inline std::wstring to_wstring(std::string_view const &rhs) noexcept
698{
699 auto s8 = sanitize_u8string(std::u8string{reinterpret_cast<char8_t const *>(rhs.data()), rhs.size()});
700 return to_wstring(s8);
701}
702
709[[nodiscard]] inline std::wstring to_wstring(std::u16string_view const &rhs) noexcept
710{
711 if constexpr (sizeof(std::wstring::value_type) == 2) {
712 return std::wstring{reinterpret_cast<wchar_t const *>(rhs.data()), rhs.size()};
713 } else {
714 auto s32 = to_u32string(rhs);
715 return std::wstring{reinterpret_cast<wchar_t const *>(s32.data()), s32.size()};
716 }
717}
718
725[[nodiscard]] inline std::wstring to_wstring(std::u32string_view const &rhs) noexcept
726{
727 if constexpr (sizeof(std::wstring::value_type) == 2) {
728 auto s16 = to_u16string(rhs);
729 return std::wstring{reinterpret_cast<wchar_t const *>(s16.data()), s16.size()};
730 } else {
731 return std::wstring{reinterpret_cast<wchar_t const *>(rhs.data()), rhs.size()};
732 }
733}
734
735} // namespace tt
T back_inserter(T... args)
T begin(T... args)
T end(T... args)
T move(T... args)
T reserve(T... args)
T size(T... args)
T swap(T... args)
T to_string(T... args)
T to_wstring(T... args)