HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
utf_8.hpp
Go to the documentation of this file.
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
9#pragma once
10
11#include "char_converter.hpp"
12#include "cp_1252.hpp"
13#include <bit>
14
15hi_warning_push();
16// C26490: Don't use reinterpret_cast.
17// Needed for SIMD intrinsics.
18hi_warning_ignore_msvc(26490);
19
20namespace hi { inline namespace v1 {
21
25template<>
26struct char_map<"utf-8"> {
27 using char_type = char;
28 using fallback_encoder_type = char_map<"cp-1252">;
29 using fallback_char_type = fallback_encoder_type::char_type;
30
31 [[nodiscard]] constexpr std::endian guess_endian(void const *ptr, size_t size, std::endian endian) const noexcept
32 {
33 return std::endian::native;
34 }
35
36 [[nodiscard]] constexpr std::pair<char32_t, bool> read_fallback(char const cu) const noexcept
37 {
38 hilet str = std::string_view(&cu, 1_uz);
39 auto first = str.begin();
40 hilet[code_point, valid] = fallback_encoder_type{}.read(first, str.end());
41 return {code_point, false};
42 }
43
44 template<typename It, typename EndIt>
45 [[nodiscard]] hi_no_inline constexpr std::pair<char32_t, bool> read(It& it, EndIt last, char8_t first_cu) const noexcept
46 {
47 if (it == last or (first_cu & 0xc0) == 0x80) {
48 // A non-ASCII character at the end of string.
49 // or an unexpected continuation code-unit should be treated as CP-1252.
50 return read_fallback(char_cast<char>(first_cu));
51
52 } else {
53 hilet length = narrow_cast<uint8_t>(std::countl_one(char_cast<uint8_t>(first_cu)));
54 hi_axiom(length >= 2);
55
56 // First part of the code-point.
57 auto cu = first_cu;
58 cu <<= length;
59 cu >>= length;
60 auto cp = char_cast<char32_t>(cu);
61
62 // Read the first continuation code-unit which is always here.
63 cu = char_cast<char8_t>(*it);
64
65 cp <<= 6;
66 cp |= cu & 0x3f;
67 if ((cu & 0xc0) != 0x80) {
68 // If the second code-unit is not a UTF-8 continuation character, treat the first
69 // code-unit as if it was CP-1252.
70 return read_fallback(char_cast<char>(first_cu));
71 }
72
73 // If there are a start and a continuation code-unit in a sequence we consider this to be properly UTF-8 encoded.
74 // So from this point any errors are replaced with 0xfffd.
75 ++it;
76
77 for (uint8_t actual_length = 2; actual_length != length; ++actual_length) {
78 if (it == last) {
79 // End-of-file
80 return {0xfffd, false};
81 }
82
83 cu = char_cast<char8_t>(*it);
84 if ((cu & 0b11'000000) != 0b10'000000) {
85 // Unexpected end of sequence.
86 return {0xfffd, false};
87 }
88
89 ++it;
90
91 // Shift in the next 6 bits.
92 cp <<= 6;
93 cp |= cu & 0b00'111111;
94 }
95
96 auto valid = true;
97 // Valid range.
98 valid &= cp < 0x11'0000;
99 // Not a surrogate.
100 valid &= cp < 0xd800 or cp >= 0xe000;
101 // Not overlong encoded.
102 valid &= length == narrow_cast<uint8_t>((cp > 0x7f) + (cp > 0x7ff) + (cp > 0xffff) + 1);
103 if (not valid) {
104 return {0xfffd, false};
105 }
106 return {cp, true};
107 }
108 }
109
110 template<typename It, typename EndIt>
111 [[nodiscard]] constexpr std::pair<char32_t, bool> read(It& it, EndIt last) const noexcept
112 {
113 hi_axiom(it != last);
114
115 auto cu = char_cast<char8_t>(*it);
116 ++it;
117 if (not to_bool(cu & 0x80)) [[likely]] {
118 // ASCII character.
119 return {char_cast<char32_t>(cu), true};
120
121 } else {
122 return read(it, last, cu);
123 }
124 }
125
126 [[nodiscard]] constexpr std::pair<uint8_t, bool> size(char32_t code_point) const noexcept
127 {
128 hi_axiom(code_point < 0x11'0000);
129 hi_axiom(not(code_point >= 0xd800 and code_point < 0xe000));
130
131 return {narrow_cast<uint8_t>((code_point > 0x7f) + (code_point > 0x7ff) + (code_point > 0xffff) + 1), true};
132 }
133
134 template<typename It>
135 constexpr void write(char32_t code_point, It& dst) const noexcept
136 {
137 hi_axiom(code_point < 0x11'0000);
138 hi_axiom(not(code_point >= 0xd800 and code_point < 0xe000));
139
140 hilet num_cu = truncate<uint8_t>((code_point > 0x7f) + (code_point > 0x7ff) + (code_point > 0xffff));
141
142 auto leading_ones = char_cast<int8_t>(uint8_t{0x80});
143 leading_ones >>= num_cu;
144 if (num_cu == 0) {
145 leading_ones = 0;
146 }
147
148 auto shift = num_cu * 6;
149
150 auto cu = truncate<uint8_t>(code_point >> shift);
151 cu |= truncate<uint8_t>(leading_ones);
152
153 // We can't cast `cu` to `dst` since it is not possible to get the
154 // value_type of an output-iterator, specifically the
155 // std::back_insert_iterator.
156 *dst = cu;
157 ++dst;
158
159 while (shift) {
160 shift -= 6;
161
162 cu = truncate<uint8_t>(code_point >> shift);
163 cu &= 0b00'111111;
164 cu |= 0b10'000000;
165
166 // We can't cast `cu` to `dst` since it is not possible to get the
167 // value_type of an output-iterator, specifically the
168 // std::back_insert_iterator.
169 *dst = cu;
170 ++dst;
171 }
172 }
173
174#if defined(HI_HAS_SSE2)
175 template<typename It>
176 hi_force_inline __m128i read_ascii_chunk16(It it) const noexcept
177 {
178 return _mm_loadu_si128(reinterpret_cast<__m128i const *>(std::addressof(*it)));
179 }
180
181 template<typename It>
182 hi_force_inline void write_ascii_chunk16(__m128i chunk, It dst) const noexcept
183 {
184 _mm_storeu_si128(reinterpret_cast<__m128i *>(std::addressof(*dst)), chunk);
185 }
186#endif
187};
188
189}} // namespace hi::v1
190
191hi_warning_pop();
Definition of the CP-1252 / Windows-1252 character map.
Definition of the char_converter<From,To> functor.
#define hi_axiom(expression,...)
Specify an axiom; an expression that is true.
Definition assert.hpp:253
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
@ read
Allow read access to a file.
@ write
Allow write access to a file.
DOXYGEN BUG.
Definition algorithm.hpp:13
geometry/margins.hpp
Definition cache.hpp:11
Character encoder/decoder template.
Definition char_converter.hpp:83
T addressof(T... args)