HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
utf_8.hpp
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "char_converter.hpp"
8#include "cp_1252.hpp"
9#include <bit>
10
11namespace hi::inline v1 {
12
13template<>
14struct char_map<"utf-8"> {
15 using char_type = char;
16 using fallback_encoder_type = char_map<"cp-1252">;
17 using fallback_char_type = fallback_encoder_type::char_type;
18
19 [[nodiscard]] constexpr std::endian guess_endian(void const *ptr, size_t size, std::endian endian) const noexcept
20 {
21 return std::endian::native;
22 }
23
24 template<typename It, typename EndIt>
25 [[nodiscard]] constexpr std::pair<char32_t, bool> read_fallback(It& it, EndIt last) const noexcept
26 {
27 hilet[code_point, valid] = fallback_encoder_type{}.read(it, last);
28 return {code_point, false};
29 }
30
31 template<typename It, typename EndIt>
32 [[nodiscard]] constexpr std::pair<char32_t, bool> read(It& it, EndIt last) const noexcept
33 {
34 hi_axiom(it != last);
35
36 auto cu = *it++;
37 if (not to_bool(cu & 0x80)) [[likely]] {
38 // ASCII character.
39 return {char_cast<char32_t>(cu), true};
40
41 } else if (it == last or (cu & 0xc0) == 0x80) [[unlikely]] {
42 // A non-ASCII character at the end of string.
43 // or an unexpected continuation code-unit should be treated as CP-1252.
44 --it;
45 return read_fallback(it, last);
46
47 } else {
48 auto length = narrow_cast<uint8_t>(std::countl_one(char_cast<uint8_t>(cu)));
49 auto todo = length - 2;
50 hi_axiom(length >= 2);
51
52 // First part of the code-point.
53 auto cp = char_cast<char32_t>(cu & (0x7f >> length));
54
55 // Read the first continuation code-unit which is always here.
56 cu = *it++;
57 cp <<= 6;
58 cp |= cu & 0x3f;
59 if ((cu & 0xc0) != 0x80) [[unlikely]] {
60 // If the second code-unit is not a UTF-8 continuation character, treat the first
61 // code-unit as if it was CP-1252.
62 it -= 2;
63 return read_fallback(it, last);
64
65 } else if (todo >= std::distance(it, last)) [[unlikely]] {
66 // If there is a start and a continuation code-unit in a row we consider this to be UTF-8 encoded.
67 // So at this point any errors are replaced with 0xfffd.
68 it = last;
69 return {0xfffd, false};
70 }
71
72 while (todo--) {
73 cu = *it++;
74 cp <<= 6;
75 cp |= cu & 0x3f;
76 if ((cu & 0xc0) != 0x80) [[unlikely]] {
77 // Unexpected end of sequence.
78 --it;
79 return {0xfffd, false};
80 }
81 }
82
83 auto valid = true;
84 // Valid range.
85 valid &= cp < 0x11'0000;
86 // Not a surrogate.
87 valid &= cp < 0xd800 or cp >= 0xe000;
88 // Not overlong encoded.
89 valid &= length == narrow_cast<uint8_t>((cp > 0x7f) + (cp > 0x7ff) + (cp > 0xffff) + 1);
90 if (not valid) [[unlikely]] {
91 return {0xfffd, false};
92 }
93 return {cp, true};
94 }
95 };
96
97 [[nodiscard]] constexpr std::pair<uint8_t, bool> size(char32_t code_point) const noexcept
98 {
99 hi_axiom(code_point < 0x11'0000);
100 hi_axiom(not(code_point >= 0xd800 and code_point < 0xe000));
101
102 return {narrow_cast<uint8_t>((code_point > 0x7f) + (code_point > 0x7ff) + (code_point > 0xffff) + 1), true};
103 }
104
105 template<typename It>
106 constexpr void write(char32_t code_point, It &dst) const noexcept
107 {
108 hi_axiom(code_point < 0x11'0000);
109 hi_axiom(not(code_point >= 0xd800 and code_point < 0xe000));
110
111 auto length = truncate<uint8_t>((code_point > 0x7f) + (code_point > 0x7ff) + (code_point > 0xffff));
112 if (auto i = length) {
113 do {
114 dst[i] = truncate<char8_t>((code_point & 0x3f) | 0x80);
115 code_point >>= 6;
116 } while (--i);
117
118 code_point |= 0x780 >> length;
119 }
120 dst[0] = truncate<char8_t>(code_point);
121 dst += length + 1;
122 }
123
124#if defined(HI_HAS_SSE2)
125 template<typename It>
126 hi_force_inline __m128i read_ascii_chunk16(It it) const noexcept
127 {
128 return _mm_loadu_si128(reinterpret_cast<__m128i const *>(std::addressof(*it)));
129 }
130
131 template<typename It>
132 hi_force_inline void write_ascii_chunk16(__m128i chunk, It dst) const noexcept
133 {
134 _mm_storeu_si128(reinterpret_cast<__m128i *>(std::addressof(*dst)), chunk);
135 }
136#endif
137};
138
139} // namespace hi::inline v1
#define hilet
Invariant should be the default for variables.
Definition required.hpp:23
Character encoder/decoder template.
Definition char_converter.hpp:34
Definition cp_1252.hpp:82
T addressof(T... args)
T distance(T... args)