HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
Unicode.hpp
// Copyright 2020 Pokitec
// All rights reserved.
3
4#include "TTauri/Foundation/required.hpp"
5#include "TTauri/Foundation/endian.hpp"
6#include "TTauri/Foundation/CP1252.hpp"
7#include "TTauri/Foundation/math.hpp"
8#include <string_view>
9#include <cstdint>
10
11#pragma once
12
13namespace tt {
14
/** Last code point of the ASCII range. */
constexpr char32_t UnicodeASCIIEnd = 0x7f;
/** Last code point of plane 0. */
constexpr char32_t UnicodePlane0End = 0xffff;
/** First code point of plane 1; code points from here on need a surrogate pair in UTF-16. */
constexpr char32_t UnicodePlane1Begin = 0x010000;
/** Last code point of plane 16. */
constexpr char32_t UnicodePlane16End = 0x10ffff;
/** First value of the surrogate range (inclusive). */
constexpr char32_t UnicodeSurrogatesBegin = 0xd800;
/** Last value of the surrogate range (inclusive). */
constexpr char32_t UnicodeSurrogatesEnd = 0xdfff;
/** First high (leading) surrogate (inclusive). */
constexpr char32_t UnicodeHighSurrogatesBegin = 0xd800;
/** Last high (leading) surrogate (inclusive). */
constexpr char32_t UnicodeHighSurrogatesEnd = 0xdbff;
/** First low (trailing) surrogate (inclusive). */
constexpr char32_t UnicodeLowSurrogatesBegin = 0xdc00;
/** Last low (trailing) surrogate (inclusive). */
constexpr char32_t UnicodeLowSurrogatesEnd = 0xdfff;
/** Last code point of the Basic Multilingual Plane. */
constexpr char32_t UnicodeBasicMultilingualPlaneEnd = UnicodePlane0End;
/** Misspelled historical name, kept so existing callers keep compiling.
 * Prefer UnicodeBasicMultilingualPlaneEnd in new code.
 */
constexpr char32_t UnicodeBasicMultilinqualPlaneEnd = UnicodeBasicMultilingualPlaneEnd;
/** Largest code point accepted by utf32_validate(). */
constexpr char32_t UnicodeMax = UnicodePlane16End;
/** U+FEFF ZERO WIDTH NO-BREAK SPACE. */
constexpr char32_t UnicodeZeroWidthNoBreakSpace = 0xfeff;
/** Byte order mark; same value as UnicodeZeroWidthNoBreakSpace. */
constexpr char32_t UnicodeBOM = UnicodeZeroWidthNoBreakSpace;
/** U+FFFD, substituted for input that can not be represented. */
constexpr char32_t UnicodeReplacementCharacter = 0xfffd;
/** Non-character U+FFFE. */
constexpr char32_t UnicodeNonCharacterFFFE = 0xfffe;
/** Non-character U+FFFF. */
constexpr char32_t UnicodeNonCharacterFFFF = 0xffff;
/** Value a BOM has when its two bytes are read in the wrong order. */
constexpr char32_t UnicodeReverseBOM = UnicodeNonCharacterFFFE;
33
/** Decompose a Latin or Armenian presentation-form ligature into its letters.
 *
 * @param x The code point to decompose.
 * @return The individual letters of the ligature, or an empty string when
 *         `x` is not one of the ligatures handled here.
 */
[[nodiscard]] inline std::u32string splitLigature(char32_t x) noexcept
{
    switch (x) {
    // Latin ligatures.
    case 0xfb00: return U"ff";
    case 0xfb01: return U"fi";
    case 0xfb02: return U"fl";
    case 0xfb03: return U"ffi";
    case 0xfb04: return U"ffl";
    case 0xfb05: return U"\u017f" U"t"; // long s + t
    case 0xfb06: return U"st";

    // Armenian ligatures.
    case 0xfb13: return U"\u0574\u0576"; // men now
    case 0xfb14: return U"\u0574\u0565"; // men ech
    case 0xfb15: return U"\u0574\u056b"; // men ini
    case 0xfb16: return U"\u057e\u0576"; // vew now
    case 0xfb17: return U"\u0574\u056d"; // men xeh

    default: return {};
    }
}
54
55[[nodiscard]] constexpr char32_t utf32_validate(char32_t c) noexcept
56{
57 return (
58 (c > UnicodeMax) ||
59 (c >= UnicodeSurrogatesBegin && c <= UnicodeSurrogatesEnd) ||
60 (c == UnicodeNonCharacterFFFE) ||
61 (c == UnicodeNonCharacterFFFF)
62 ) ?
63 UnicodeReplacementCharacter :
64 c;
65}
66
67template<typename UnaryOperation>
68[[nodiscard]] inline std::u16string u16string_transform(std::u16string_view str, UnaryOperation unary_op) noexcept
69{
70 auto r = std::u16string{};
71 r.reserve(ssize(str));
72
73 std::transform(str.cbegin(), str.cend(), std::back_inserter(r), unary_op);
74 return r;
75}
76
77[[nodiscard]] inline std::u16string u16string_byte_swap(std::u16string_view str) noexcept
78{
79 return u16string_transform(str, [](ttlet &c) { return byte_swap(c); });
80}
81
82[[nodiscard]] inline std::u16string u16string_little_to_native(std::u16string_view str) noexcept
83{
84 return u16string_transform(str, [](ttlet &c) { return little_to_native(c); });
85}
86
87[[nodiscard]] inline std::u16string u16string_big_to_native(std::u16string_view str) noexcept
88{
89 return u16string_transform(str, [](ttlet &c) { return big_to_native(c); });
90}
91
99[[nodiscard]] constexpr char16_t utf32_to_utf16(char32_t c, int &state) noexcept
100{
101 if (c >= UnicodePlane1Begin) {
102 c -= UnicodePlane1Begin;
103 if (state < 0) {
104 c >>= 10;
105 state = 1;
106 return static_cast<char16_t>(UnicodeHighSurrogatesBegin + c);
107 } else {
108 c &= 0x3ff;
109 state = 0;
110 return static_cast<char16_t>(UnicodeLowSurrogatesBegin + c);
111 }
112
113 } else {
114 state = 0;
115 return static_cast<char16_t>(c);
116 }
117}
118
127[[nodiscard]] constexpr char32_t utf16_to_utf32(char16_t c, uint32_t &state) noexcept
128{
129 if (state == 0) {
130 if (c >= UnicodeHighSurrogatesBegin && c <= UnicodeHighSurrogatesEnd) {
131 state = static_cast<uint32_t>(c - UnicodeHighSurrogatesBegin) << 18 | 1;
132 return 0;
133 } else if (c >= UnicodeLowSurrogatesBegin && c <= UnicodeLowSurrogatesEnd) {
134 return UnicodeReplacementCharacter;
135 } else {
136 return static_cast<char32_t>(c);
137 }
138 } else {
139 if (c >= UnicodeLowSurrogatesBegin && c <= UnicodeLowSurrogatesEnd) {
140 ttlet upper10bits = static_cast<char32_t>(state >> 8);
141 ttlet lower10bits = static_cast<char32_t>(c - UnicodeLowSurrogatesBegin);
142 state = 0;
143 return (upper10bits | lower10bits) + UnicodePlane1Begin;
144 } else {
145 state = 0;
146 return UnicodeReplacementCharacter;
147 }
148 }
149}
150
/** Produce the next UTF-8 byte for a code point.
 *
 * Call with `state` negative for the first byte of a code point and keep
 * calling with the same `c` until `state` is zero.
 *
 * @param c The code point to encode.
 * @param state Negative on the first call; afterwards the number of bits of
 *              `c` that still have to be emitted. Zero when `c` is complete.
 * @return The next UTF-8 byte of `c`.
 */
[[nodiscard]] constexpr char utf32_to_utf8(char32_t c, int &state) noexcept
{
    if (state >= 0) {
        // Continuation byte: the next six bits, prefixed with 0b10.
        state -= 6;
        return static_cast<char>(((c >> state) & 0x3f) | 0x80);
    }

    if (c <= 0x7f) {
        // ASCII is a single byte.
        state = 0;
        return static_cast<char>(c);
    }

    // Lead byte: pick the length marker and how many bits follow it.
    char32_t marker = 0xf0;
    int remaining_bits = 18;
    if (c <= 0x07ff) {
        marker = 0xc0;
        remaining_bits = 6;
    } else if (c <= 0xffff) {
        marker = 0xe0;
        remaining_bits = 12;
    }

    state = remaining_bits;
    return static_cast<char>((c >> remaining_bits) | marker);
}
180
181[[nodiscard]] tt_no_inline char32_t utf8_to_utf32_fallback(char c) noexcept
182{
183 return CP1252_to_UTF32(c);
184}
185
/** State carried between successive calls to utf8_to_utf32(). */
struct utf8_to_utf32_state {
    /** Number of continuation bytes still expected; zero between code points. */
    int trailing_bytes;
    /** The code point assembled so far from the bytes already consumed. */
    char32_t code;

    /** Start in the "between code points" state with an empty accumulator. */
    constexpr utf8_to_utf32_state() noexcept : trailing_bytes(0), code(0) {}
};
192
202[[nodiscard]] constexpr char32_t utf8_to_utf32(char c, utf8_to_utf32_state &state) noexcept
203{
204 auto c_ = static_cast<uint8_t>(c);
205
206 if (state.trailing_bytes) {
207 if ((c_ & 0xc0) == 0x80) {
208 --state.trailing_bytes;
209 state.code <<= 6;
210 state.code |= (c_ & 0x3f);
211 return state.trailing_bytes ? 0 : state.code;
212
213 } else {
214 state.trailing_bytes = 0;
215 return utf8_to_utf32_fallback(c_);
216 }
217
218 } else {
219 ttlet inv_c32 = static_cast<uint32_t>(static_cast<uint8_t>(~c_));
220 ttlet nr_data_bits = bsr(inv_c32);
221
222 state.trailing_bytes = 6 - nr_data_bits;
223 if (state.trailing_bytes < 0) {
224 // 0b0xxxxxxx
225 state.trailing_bytes = 0;
226 return c_ & 0x7f;
227
228 } else if (state.trailing_bytes > 0 && state.trailing_bytes <= 3) {
229 // 0b110xxxxx, 0b1110xxxx, 0b11110xxx,
230 ttlet data_mask = (1 << nr_data_bits) - 1;
231 state.code = (c_ & data_mask);
232 return 0;
233
234 } else {
235 // 0b10xxxxx
236 // 0b111110xx, 0b1111110x, 0b11111110
237 state.trailing_bytes = 0;
238 return utf8_to_utf32_fallback(c_);
239 }
240 }
241}
242
245[[nodiscard]] inline std::string to_string(std::u32string_view rhs) noexcept {
246 auto r = std::string{};
247 r.reserve(ssize(rhs));
248
249 for (auto c: rhs) {
250 c = utf32_validate(c);
251
252 int state = -1;
253 do {
254 r += utf32_to_utf8(c, state);
255 } while (state);
256 }
257
258 return r;
259}
260
263[[nodiscard]] inline std::u16string to_u16string(std::u32string_view rhs) noexcept {
264 auto r = std::u16string{};
265 r.reserve(ssize(rhs));
266
267 for (auto c: rhs) {
268 c = utf32_validate(c);
269
270 int state = -1;
271 do {
272 r += utf32_to_utf16(c, state);
273 } while (state);
274 }
275
276 return r;
277}
278
279#if WCHAR_MAX < 65536
282[[nodiscard]] inline std::wstring to_wstring(std::u32string_view rhs) noexcept {
283 auto r = std::wstring{};
284 r.reserve(ssize(rhs));
285
286 for (auto c: rhs) {
287 c = utf32_validate(c);
288
289 int state = -1;
290 do {
291 r += static_cast<wchar_t>(utf32_to_utf16(c, state));
292 } while (state);
293 }
294
295 return r;
296}
297#else
300[[nodiscard]] inline std::wstring to_wstring(std::u32string_view rhs) noexcept {
301 auto r = std::wstring{};
302 r.reserve(ssize(rhs));
303
304 for (auto c: rhs) {
305 r += static_cast<wchar_t>(c);
306 }
307
308 return r;
309}
310#endif
311
314[[nodiscard]] inline std::u32string to_u32string(std::string_view rhs) noexcept {
315 auto r = std::u32string{};
316 r.reserve(ssize(rhs));
317
318 auto state = utf8_to_utf32_state{};
319 for (ttlet u: rhs) {
320 if (auto c = utf8_to_utf32(u, state)) {
321 r += utf32_validate(c);
322 }
323 }
324
325 return r;
326}
327
330[[nodiscard]] inline std::u32string to_u32string(std::u16string_view rhs) noexcept {
331 auto r = std::u32string{};
332 r.reserve(ssize(rhs));
333
334 auto swapped_str = std::u16string{};
335 if (ssize(rhs) != 0 && rhs.front() == UnicodeReverseBOM) {
336 swapped_str = u16string_byte_swap(rhs);
337 rhs = std::u16string_view{swapped_str};
338 }
339
340 uint32_t state = 0;
341 for (ttlet u: rhs) {
342 if (auto c = utf16_to_utf32(u, state)) {
343 tt_assume(c <= UnicodeMax);
344 r += utf32_validate(c);
345 }
346 }
347
348 return r;
349}
350
351#if WCHAR_MAX < 65536
354[[nodiscard]] inline std::u32string to_u32string(std::wstring_view rhs) noexcept {
355 auto r = std::u32string{};
356 r.reserve(ssize(rhs));
357
358 uint32_t state = 0;
359 for (ttlet u: rhs) {
360 if (auto c = utf16_to_utf32(static_cast<char16_t>(u), state)) {
361 r += utf32_validate(c);
362 }
363 }
364
365 return r;
366}
367#else
370[[nodiscard]] inline std::u32string to_u32string(std::wstring_view rhs) noexcept {
371 auto r = std::u32string{};
372 r.reserve(ssize(rhs));
373
374 for (ttlet c: rhs) {
375 r += static_cast<wchar_t>(c);
376 }
377
378 return r;
379}
380#endif
381
384[[nodiscard]] inline std::string to_string(std::u16string_view rhs) noexcept {
385 return to_string(to_u32string(rhs));
386}
387
390[[nodiscard]] inline std::string to_string(std::wstring_view rhs) noexcept {
391 return to_string(to_u32string(rhs));
392}
393
396[[nodiscard]] inline std::u16string to_u16string(std::string_view rhs) noexcept {
397 return to_u16string(to_u32string(rhs));
398}
399
402[[nodiscard]] inline std::wstring to_wstring(std::string_view rhs) noexcept {
403 return to_wstring(to_u32string(rhs));
404}
405
406}
407
Definition Unicode.hpp:186
T back_inserter(T... args)
T reserve(T... args)
T to_string(T... args)
T to_wstring(T... args)
T transform(T... args)