HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
utf_8.hpp
Go to the documentation of this file.
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
9#pragma once
10
11#include "../macros.hpp"
12#include "../utility/utility.hpp"
13#include "char_converter.hpp"
14#include "cp_1252.hpp"
15#include <bit>
16#include <utility>
17#include <cstddef>
18#include <compare>
19#if defined(HI_HAS_SSE2)
20#include <emmintrin.h>
21#endif
22
23hi_export_module(hikogui.char_maps.utf_8);
24
25hi_warning_push();
26// C26490: Don't use reinterpret_cast.
27// Needed for SIMD intrinsics.
28hi_warning_ignore_msvc(26490);
29
30hi_export namespace hi { inline namespace v1 {
31
35template<>
36struct char_map<"utf-8"> {
37 using char_type = char;
38 using fallback_encoder_type = char_map<"cp-1252">;
39 using fallback_char_type = fallback_encoder_type::char_type;
40
41 [[nodiscard]] constexpr std::endian guess_endian(void const *ptr, size_t size, std::endian endian) const noexcept
42 {
43 return std::endian::native;
44 }
45
46 [[nodiscard]] constexpr std::pair<char32_t, bool> read_fallback(char const cu) const noexcept
47 {
48 auto const str = std::string_view(&cu, 1_uz);
49 auto first = str.begin();
50 auto const[code_point, valid] = fallback_encoder_type{}.read(first, str.end());
51 return {code_point, false};
52 }
53
54 template<typename It, typename EndIt>
55 [[nodiscard]] hi_no_inline constexpr std::pair<char32_t, bool> read(It& it, EndIt last, char8_t first_cu) const noexcept
56 {
57 if (it == last or (first_cu & 0xc0) == 0x80) {
58 // A non-ASCII character at the end of string.
59 // or an unexpected continuation code-unit should be treated as CP-1252.
60 return read_fallback(char_cast<char>(first_cu));
61
62 } else {
63 auto const length = narrow_cast<uint8_t>(std::countl_one(char_cast<uint8_t>(first_cu)));
64 hi_axiom(length >= 2);
65
66 // First part of the code-point.
67 auto cu = first_cu;
68 cu <<= length;
69 cu >>= length;
70 auto cp = char_cast<char32_t>(cu);
71
72 // Read the first continuation code-unit which is always here.
73 cu = char_cast<char8_t>(*it);
74
75 cp <<= 6;
76 cp |= cu & 0x3f;
77 if ((cu & 0xc0) != 0x80) {
78 // If the second code-unit is not a UTF-8 continuation character, treat the first
79 // code-unit as if it was CP-1252.
80 return read_fallback(char_cast<char>(first_cu));
81 }
82
83 // If there are a start and a continuation code-unit in a sequence we consider this to be properly UTF-8 encoded.
84 // So from this point any errors are replaced with 0xfffd.
85 ++it;
86
87 for (uint8_t actual_length = 2; actual_length != length; ++actual_length) {
88 if (it == last) {
89 // End-of-file
90 return {0xfffd, false};
91 }
92
93 cu = char_cast<char8_t>(*it);
94 if ((cu & 0b11'000000) != 0b10'000000) {
95 // Unexpected end of sequence.
96 return {0xfffd, false};
97 }
98
99 ++it;
100
101 // Shift in the next 6 bits.
102 cp <<= 6;
103 cp |= cu & 0b00'111111;
104 }
105
106 auto valid = true;
107 // Valid range.
108 valid &= cp < 0x11'0000;
109 // Not a surrogate.
110 valid &= cp < 0xd800 or cp >= 0xe000;
111 // Not overlong encoded.
112 valid &= length == narrow_cast<uint8_t>((cp > 0x7f) + (cp > 0x7ff) + (cp > 0xffff) + 1);
113 if (not valid) {
114 return {0xfffd, false};
115 }
116 return {cp, true};
117 }
118 }
119
120 template<typename It, typename EndIt>
121 [[nodiscard]] constexpr std::pair<char32_t, bool> read(It& it, EndIt last) const noexcept
122 {
123 hi_axiom(it != last);
124
125 auto cu = char_cast<char8_t>(*it);
126 ++it;
127 if (not to_bool(cu & 0x80)) [[likely]] {
128 // ASCII character.
129 return {char_cast<char32_t>(cu), true};
130
131 } else {
132 return read(it, last, cu);
133 }
134 }
135
136 [[nodiscard]] constexpr std::pair<uint8_t, bool> size(char32_t code_point) const noexcept
137 {
138 hi_axiom(code_point < 0x11'0000);
139 hi_axiom(not(code_point >= 0xd800 and code_point < 0xe000));
140
141 return {narrow_cast<uint8_t>((code_point > 0x7f) + (code_point > 0x7ff) + (code_point > 0xffff) + 1), true};
142 }
143
144 template<typename It>
145 constexpr void write(char32_t code_point, It& dst) const noexcept
146 {
147 hi_axiom(code_point < 0x11'0000);
148 hi_axiom(not(code_point >= 0xd800 and code_point < 0xe000));
149
150 auto const num_cu = truncate<uint8_t>((code_point > 0x7f) + (code_point > 0x7ff) + (code_point > 0xffff));
151
152 auto leading_ones = char_cast<int8_t>(uint8_t{0x80});
153 leading_ones >>= num_cu;
154 if (num_cu == 0) {
155 leading_ones = 0;
156 }
157
158 auto shift = num_cu * 6;
159
160 auto cu = truncate<uint8_t>(code_point >> shift);
161 cu |= truncate<uint8_t>(leading_ones);
162
163 // We can't cast `cu` to `dst` since it is not possible to get the
164 // value_type of an output-iterator, specifically the
165 // std::back_insert_iterator.
166 *dst = cu;
167 ++dst;
168
169 while (shift) {
170 shift -= 6;
171
172 cu = truncate<uint8_t>(code_point >> shift);
173 cu &= 0b00'111111;
174 cu |= 0b10'000000;
175
176 // We can't cast `cu` to `dst` since it is not possible to get the
177 // value_type of an output-iterator, specifically the
178 // std::back_insert_iterator.
179 *dst = cu;
180 ++dst;
181 }
182 }
183
184#if defined(HI_HAS_SSE2)
185 template<typename It>
186 hi_force_inline __m128i read_ascii_chunk16(It it) const noexcept
187 {
188 return _mm_loadu_si128(reinterpret_cast<__m128i const *>(std::addressof(*it)));
189 }
190
191 template<typename It>
192 hi_force_inline void write_ascii_chunk16(__m128i chunk, It dst) const noexcept
193 {
194 _mm_storeu_si128(reinterpret_cast<__m128i *>(std::addressof(*dst)), chunk);
195 }
196#endif
197};
198
199}} // namespace hi::v1
200
201hi_warning_pop();
Definition of the CP-1252 / Windows-1252 character map.
Definition of the char_converter<From,To> functor.
@ read
Allow read access to a file.
@ write
Allow write access to a file.
The HikoGUI namespace.
Definition array_generic.hpp:20
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
Character encoder/decoder template.
Definition char_converter.hpp:89
T addressof(T... args)