HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
utf_8.hpp
Go to the documentation of this file.
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
9#pragma once
10
11#include "../macros.hpp"
12#include "char_converter.hpp"
13#include "cp_1252.hpp"
14#include <bit>
15#include <utility>
16#include <cstddef>
17
18hi_warning_push();
19// C26490: Don't use reinterpret_cast.
20// Needed for SIMD intrinsics.
21hi_warning_ignore_msvc(26490);
22
23namespace hi { inline namespace v1 {
24
28template<>
29struct char_map<"utf-8"> {
30 using char_type = char;
31 using fallback_encoder_type = char_map<"cp-1252">;
32 using fallback_char_type = fallback_encoder_type::char_type;
33
34 [[nodiscard]] constexpr std::endian guess_endian(void const *ptr, size_t size, std::endian endian) const noexcept
35 {
36 return std::endian::native;
37 }
38
39 [[nodiscard]] constexpr std::pair<char32_t, bool> read_fallback(char const cu) const noexcept
40 {
41 hilet str = std::string_view(&cu, 1_uz);
42 auto first = str.begin();
43 hilet[code_point, valid] = fallback_encoder_type{}.read(first, str.end());
44 return {code_point, false};
45 }
46
47 template<typename It, typename EndIt>
48 [[nodiscard]] hi_no_inline constexpr std::pair<char32_t, bool> read(It& it, EndIt last, char8_t first_cu) const noexcept
49 {
50 if (it == last or (first_cu & 0xc0) == 0x80) {
51 // A non-ASCII character at the end of string.
52 // or an unexpected continuation code-unit should be treated as CP-1252.
53 return read_fallback(char_cast<char>(first_cu));
54
55 } else {
56 hilet length = narrow_cast<uint8_t>(std::countl_one(char_cast<uint8_t>(first_cu)));
57 hi_axiom(length >= 2);
58
59 // First part of the code-point.
60 auto cu = first_cu;
61 cu <<= length;
62 cu >>= length;
64
65 // Read the first continuation code-unit which is always here.
67
68 cp <<= 6;
69 cp |= cu & 0x3f;
70 if ((cu & 0xc0) != 0x80) {
71 // If the second code-unit is not a UTF-8 continuation character, treat the first
72 // code-unit as if it was CP-1252.
73 return read_fallback(char_cast<char>(first_cu));
74 }
75
76 // If there are a start and a continuation code-unit in a sequence we consider this to be properly UTF-8 encoded.
77 // So from this point any errors are replaced with 0xfffd.
78 ++it;
79
80 for (uint8_t actual_length = 2; actual_length != length; ++actual_length) {
81 if (it == last) {
82 // End-of-file
83 return {0xfffd, false};
84 }
85
87 if ((cu & 0b11'000000) != 0b10'000000) {
88 // Unexpected end of sequence.
89 return {0xfffd, false};
90 }
91
92 ++it;
93
94 // Shift in the next 6 bits.
95 cp <<= 6;
96 cp |= cu & 0b00'111111;
97 }
98
99 auto valid = true;
100 // Valid range.
101 valid &= cp < 0x11'0000;
102 // Not a surrogate.
103 valid &= cp < 0xd800 or cp >= 0xe000;
104 // Not overlong encoded.
105 valid &= length == narrow_cast<uint8_t>((cp > 0x7f) + (cp > 0x7ff) + (cp > 0xffff) + 1);
106 if (not valid) {
107 return {0xfffd, false};
108 }
109 return {cp, true};
110 }
111 }
112
113 template<typename It, typename EndIt>
114 [[nodiscard]] constexpr std::pair<char32_t, bool> read(It& it, EndIt last) const noexcept
115 {
116 hi_axiom(it != last);
117
118 auto cu = char_cast<char8_t>(*it);
119 ++it;
120 if (not to_bool(cu & 0x80)) [[likely]] {
121 // ASCII character.
122 return {char_cast<char32_t>(cu), true};
123
124 } else {
125 return read(it, last, cu);
126 }
127 }
128
129 [[nodiscard]] constexpr std::pair<uint8_t, bool> size(char32_t code_point) const noexcept
130 {
131 hi_axiom(code_point < 0x11'0000);
132 hi_axiom(not(code_point >= 0xd800 and code_point < 0xe000));
133
134 return {narrow_cast<uint8_t>((code_point > 0x7f) + (code_point > 0x7ff) + (code_point > 0xffff) + 1), true};
135 }
136
137 template<typename It>
138 constexpr void write(char32_t code_point, It& dst) const noexcept
139 {
140 hi_axiom(code_point < 0x11'0000);
141 hi_axiom(not(code_point >= 0xd800 and code_point < 0xe000));
142
143 hilet num_cu = truncate<uint8_t>((code_point > 0x7f) + (code_point > 0x7ff) + (code_point > 0xffff));
144
145 auto leading_ones = char_cast<int8_t>(uint8_t{0x80});
147 if (num_cu == 0) {
148 leading_ones = 0;
149 }
150
151 auto shift = num_cu * 6;
152
153 auto cu = truncate<uint8_t>(code_point >> shift);
155
156 // We can't cast `cu` to `dst` since it is not possible to get the
157 // value_type of an output-iterator, specifically the
158 // std::back_insert_iterator.
159 *dst = cu;
160 ++dst;
161
162 while (shift) {
163 shift -= 6;
164
165 cu = truncate<uint8_t>(code_point >> shift);
166 cu &= 0b00'111111;
167 cu |= 0b10'000000;
168
169 // We can't cast `cu` to `dst` since it is not possible to get the
170 // value_type of an output-iterator, specifically the
171 // std::back_insert_iterator.
172 *dst = cu;
173 ++dst;
174 }
175 }
176
177#if defined(HI_HAS_SSE2)
178 template<typename It>
179 hi_force_inline __m128i read_ascii_chunk16(It it) const noexcept
180 {
181 return _mm_loadu_si128(reinterpret_cast<__m128i const *>(std::addressof(*it)));
182 }
183
184 template<typename It>
185 hi_force_inline void write_ascii_chunk16(__m128i chunk, It dst) const noexcept
186 {
187 _mm_storeu_si128(reinterpret_cast<__m128i *>(std::addressof(*dst)), chunk);
188 }
189#endif
190};
191
192}} // namespace hi::v1
193
194hi_warning_pop();
Definition of the CP-1252 / Windows-1252 character map.
Definition of the char_converter<From,To> functor.
@ read
Allow read access to a file.
@ write
Allow write access to a file.
DOXYGEN BUG.
Definition algorithm.hpp:16
geometry/margins.hpp
Definition lookahead_iterator.hpp:5
constexpr Out narrow_cast(In const &rhs) noexcept
Cast numeric values without loss of precision.
Definition cast.hpp:377
Character encoder/decoder template.
Definition char_converter.hpp:86
T addressof(T... args)