HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
UTF.hpp
1// Copyright Take Vos 2021.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "../required.hpp"
8#include "../cast.hpp"
9#include <string>
10#include <string_view>
11#include <bit>
12#include <type_traits>
13#include <concepts>
14
15namespace hi::inline v1 {
16namespace detail {
17
/** Append one code point to a string, encoded for the string's code-unit size.
 *
 * This is the unconstrained primary declaration; the overloads constrained on
 * `sizeof(CharT)` (1, 2 or 4) supply the actual encoders.
 *
 * @tparam CharT The code-unit type of the destination string.
 * @param r The string to append to.
 * @param code_point The code point to encode.
 */
template<class CharT>
constexpr void append_code_point(std::basic_string<CharT> &r, uint32_t code_point) noexcept;
20
21template<typename CharT>
22constexpr void append_code_point(std::basic_string<CharT> &r, uint32_t code_point) noexcept requires(sizeof(CharT) == 1)
23{
24 if (code_point < 0x80) {
25 r += static_cast<CharT>(code_point);
26 } else if (code_point < 0x800) {
27 r += static_cast<CharT>((code_point >> 6) | 0xc0);
28 r += static_cast<CharT>((code_point & 0x3f) | 0x80);
29 } else if (code_point < 0xd800) {
30 r += static_cast<CharT>((code_point >> 12) | 0xe0);
31 r += static_cast<CharT>(((code_point >> 6) & 0x3f) | 0x80);
32 r += static_cast<CharT>((code_point & 0x3f) | 0x80);
33 } else if (code_point < 0xe000) {
34 append_code_point(r, 0xfffd);
35 } else if (code_point < 0x1'0000) {
36 r += static_cast<CharT>((code_point >> 12) | 0xe0);
37 r += static_cast<CharT>(((code_point >> 6) & 0x3f) | 0x80);
38 r += static_cast<CharT>((code_point & 0x3f) | 0x80);
39 } else if (code_point < 0x11'0000) {
40 r += static_cast<CharT>((code_point >> 18) | 0xf0);
41 r += static_cast<CharT>(((code_point >> 12) & 0x3f) | 0x80);
42 r += static_cast<CharT>(((code_point >> 6) & 0x3f) | 0x80);
43 r += static_cast<CharT>((code_point & 0x3f) | 0x80);
44 } else {
45 append_code_point(r, 0xfffd);
46 }
47}
48
49template<typename CharT>
50constexpr void append_code_point(std::basic_string<CharT> &r, uint32_t code_point) noexcept requires(sizeof(CharT) == 2)
51{
52 if (code_point < 0xd800) {
53 r += static_cast<CharT>(code_point);
54 } else if (code_point < 0xe000) {
55 append_code_point(r, 0xfffd);
56 } else if (code_point < 0x1'0000) {
57 r += static_cast<CharT>(code_point);
58 } else if (code_point < 0x11'0000) {
59 code_point -= 0x1'0000;
60 r += static_cast<CharT>(0xd800 + (code_point >> 10));
61 r += static_cast<CharT>(0xdc00 + (code_point & 0x3ff));
62 } else {
63 append_code_point(r, 0xfffd);
64 }
65}
66
67template<typename CharT>
68constexpr void append_code_point(std::basic_string<CharT> &r, uint32_t code_point) noexcept requires(sizeof(CharT) == 4)
69{
70 if (code_point < 0xd800) {
71 r += static_cast<char32_t>(code_point);
72 } else if (code_point < 0xe000) {
73 append_code_point(r, 0xfffd);
74 } else if (code_point < 0x11'0000) {
75 r += static_cast<char32_t>(code_point);
76 } else {
77 append_code_point(r, 0xfffd);
78 }
79}
80
/** Estimate the number of code units needed to hold a converted string.
 *
 * Generic overload: the estimate is simply the number of code units in the
 * source string. More constrained overloads exist for conversions from and to
 * 8-bit code units.
 *
 * @tparam ToCharT The code-unit type of the destination string.
 * @tparam FromCharT The code-unit type of the source string.
 * @param rhs The source string.
 * @return The number of code units in `rhs`.
 */
template<typename ToCharT, typename FromCharT>
[[nodiscard]] constexpr std::size_t guess_num_code_units(std::basic_string_view<FromCharT> const &rhs) noexcept
{
    return rhs.size();
}
86
87template<typename ToCharT, typename FromCharT>
88[[nodiscard]] constexpr std::size_t guess_num_code_units(std::basic_string_view<FromCharT> const &rhs) noexcept
89 requires(sizeof(FromCharT) == 1 and sizeof(ToCharT) > 1)
90{
91 std::size_t r = 0;
92 for (hilet c : rhs) {
93 if ((c & 0xc0) != 0x80) {
94 ++r;
95 }
96 }
97 return r;
98}
99
100template<typename ToCharT, typename FromCharT>
101[[nodiscard]] constexpr std::size_t guess_num_code_units(std::basic_string_view<FromCharT> const &rhs) noexcept
102 requires(sizeof(FromCharT) > 1 and sizeof(ToCharT) == 1)
103{
104 std::size_t r = 0;
105 for (hilet c : rhs) {
106 if (c < 0x80) {
107 ++r;
108 } else if (c < 0x800) {
109 r += 2;
110 } else if (c < 0x1'0000) {
111 r += 3;
112 } else {
113 r += 4;
114 }
115 }
116 return r;
117}
118
119template<typename ToCharT, typename FromCharT>
120[[nodiscard]] constexpr std::basic_string<ToCharT> from_utf8(std::basic_string_view<FromCharT> rhs) noexcept
121{
122 static_assert(sizeof(FromCharT) == 1, "Expect UTF-8 string to be in a 8-bit char type.");
123
125 r.reserve(guess_num_code_units<ToCharT>(rhs));
126
127 auto it = begin(rhs);
128 hilet last = end(rhs);
129
130 uint32_t code_point = 0;
131 int todo = 0;
132 int num = 0;
133 while (it != last) {
134 hilet c = static_cast<uint8_t>(*it);
135
136 if (todo == 0) {
137 ++it;
138
139 if (c < 0x80) {
140 todo = num = 1;
141 code_point = c;
142
143 } else if (c < 0xc0) {
144 // Invalid continuation character.
145 todo = 1;
146 num = 3;
147 code_point = 0xfffd;
148
149 } else if (c < 0xe0) {
150 todo = num = 2;
151 code_point = c & 0x1f;
152
153 } else if (c < 0xf0) {
154 todo = num = 3;
155 code_point = c & 0x0f;
156
157 } else if (c < 0xf8) {
158 todo = num = 4;
159 code_point = c & 0x07;
160
161 } else {
162 // Invalid code-unit.
163 todo = 1;
164 num = 3;
165 code_point = 0xfffd;
166 }
167
168 } else if ((c & 0xc0) == 0x80) {
169 ++it;
170 code_point <<= 6;
171 code_point |= c & 0x3f;
172
173 } else {
174 // Unexpected non-continuation characters. Redo the current character as a start character.
175 todo = 1;
176 num = 3;
177 code_point = 0xfffd;
178 }
179
180 if (--todo == 0) {
181 if ((num == 2 and code_point < 0x80) or (num == 3 and code_point < 0x800) or (num == 4 and code_point < 0x1'0000)) {
182 // Overlong encoding.
183 code_point = 0xfffd;
184 }
185
186 append_code_point(r, code_point);
187 }
188 }
189
190 if (todo) {
191 append_code_point(r, 0xfffd);
192 }
193
194 return r;
195}
196
197template<typename ToCharT, typename FromCharT>
198[[nodiscard]] constexpr std::basic_string<ToCharT> from_utf16(std::basic_string_view<FromCharT> rhs) noexcept
199{
200 static_assert(sizeof(FromCharT) == 2, "Expect UTF-16 string to be in a 16-bit char type.");
201
203 r.reserve(guess_num_code_units<ToCharT>(rhs));
204
205 auto it = begin(rhs);
206 hilet last = end(rhs);
207
208 uint32_t code_point = 0;
209 int todo = 0;
210 while (it != last) {
211 hilet c = *it;
212
213 if (todo == 0) {
214 ++it;
215 if (c < 0xd800) {
216 todo = 1;
217 code_point = c;
218
219 } else if (c < 0xdc00) {
220 todo = 2;
221 code_point = (c - 0xd800) << 10;
222
223 } else if (c < 0xe000) {
224 // Invalid low surrogate.
225 todo = 1;
226 code_point = 0xfffd;
227
228 } else {
229 todo = 1;
230 code_point = c;
231 }
232
233 } else if (c >= 0xdc00 and c < 0xe000) {
234 ++it;
235 code_point |= (c - 0xdc00);
236 code_point += 0x1'0000;
237
238 } else {
239 // Missing low surrogate, redo the current code-unit.
240 todo = 1;
241 code_point = 0xfffd;
242 }
243
244 if (--todo == 0) {
245 append_code_point(r, code_point);
246 }
247 }
248
249 if (todo) {
250 append_code_point(r, 0xfffd);
251 }
252
253 return r;
254}
255
256template<typename ToCharT, typename FromCharT>
257[[nodiscard]] constexpr std::basic_string<ToCharT> from_utf32(std::basic_string_view<FromCharT> rhs) noexcept
258{
259 static_assert(sizeof(FromCharT) == 4, "Expect UTF-32 string to be in a 32-bit char type.");
260
262 r.reserve(guess_num_code_units<ToCharT>(rhs));
263
264 for (hilet c : rhs) {
265 append_code_point(r, static_cast<uint32_t>(c));
266 }
267
268 return r;
269}
270
271} // namespace detail
272
280template<typename It>
281[[nodiscard]] constexpr std::endian guess_utf16_endianess(It first, It last, std::endian default_guess)
282{
283 static_assert(sizeof(*first) == 1, "Expecting an array of 8-bit characters");
284 hilet num_words = narrow_cast<std::size_t>(std::distance(first, last) / 2);
285
286 if (not num_words) {
287 return default_guess;
288 }
289
290 // Check for BOM.
291 {
292 hilet c0 = static_cast<uint8_t>(*first);
293 hilet c1 = static_cast<uint8_t>(*(first + 1));
294 if (c0 == 0xfe && c1 == 0xff) {
295 return std::endian::big;
296 } else if (c1 == 0xfe and c0 == 0xff) {
297 return std::endian::little;
298 }
299 }
300
301 // Count the nul bytes in high or low byte of the UTF16 string.
302 std::size_t count0 = 0;
303 std::size_t count1 = 0;
304 auto it = first;
305 for (auto i = 0; i != num_words; ++i) {
306 hilet c0 = static_cast<uint8_t>(*(it++));
307 hilet c1 = static_cast<uint8_t>(*(it++));
308
309 if (c0 == 0 and c0 != c1) {
310 ++count0;
311 } else if (c1 == 0 and c0 != c1) {
312 ++count1;
313 }
314 }
315
316 // Check for at least 1/8 ASCII characters.
317 if (count0 == count1) {
318 return default_guess;
319 } else if (count0 > count1 and count0 > (num_words / 8)) {
320 return std::endian::little;
321 } else if (count1 > count0 and count1 > (num_words / 8)) {
322 return std::endian::big;
323 } else {
324 return default_guess;
325 }
326}
327
328template<typename FromChar>
329[[nodiscard]] constexpr std::string to_string(std::basic_string_view<FromChar> rhs) noexcept
330{
331 hi_static_not_implemented();
332}
333
334template<std::same_as<char16_t> FromChar>
335[[nodiscard]] constexpr std::string to_string(std::basic_string_view<FromChar> rhs) noexcept
336{
337 return detail::from_utf16<char>(rhs);
338}
339
340template<std::same_as<char32_t> FromChar>
341[[nodiscard]] constexpr std::string to_string(std::basic_string_view<FromChar> rhs) noexcept
342{
343 return detail::from_utf32<char>(rhs);
344}
345
346template<std::same_as<wchar_t> FromChar>
347[[nodiscard]] constexpr std::string to_string(std::basic_string_view<FromChar> rhs) noexcept
348 requires(sizeof(FromChar) == sizeof(char16_t))
349{
350 return detail::from_utf16<char>(rhs);
351}
352
353template<std::same_as<wchar_t> FromChar>
354[[nodiscard]] constexpr std::string to_string(std::basic_string_view<FromChar> rhs) noexcept
355 requires(sizeof(FromChar) == sizeof(char32_t))
356{
357 return detail::from_utf32<char>(rhs);
358}
359
360template<typename FromChar>
361[[nodiscard]] constexpr std::u16string to_u16string(std::basic_string_view<FromChar> rhs) noexcept
362{
363 hi_static_not_implemented();
364}
365
366template<std::same_as<char> FromChar>
367[[nodiscard]] constexpr std::u16string to_u16string(std::basic_string_view<FromChar> rhs) noexcept
368{
369 return detail::from_utf8<char16_t>(rhs);
370}
371
372template<std::same_as<char32_t> FromChar>
373[[nodiscard]] constexpr std::u16string to_u16string(std::basic_string_view<FromChar> rhs) noexcept
374{
375 return detail::from_utf32<char16_t>(rhs);
376}
377
378template<std::same_as<wchar_t> FromChar>
379[[nodiscard]] constexpr std::u16string to_u16string(std::basic_string_view<FromChar> rhs) noexcept
380 requires(sizeof(FromChar) == sizeof(char16_t))
381{
382 return detail::from_utf16<char16_t>(rhs);
383}
384
385template<std::same_as<wchar_t> FromChar>
386[[nodiscard]] constexpr std::u16string to_u16string(std::basic_string_view<FromChar> rhs) noexcept
387 requires(sizeof(FromChar) == sizeof(char32_t))
388{
389 return detail::from_utf32<char16_t>(rhs);
390}
391
392template<typename FromChar>
393[[nodiscard]] constexpr std::u32string to_u32string(std::basic_string_view<FromChar> rhs) noexcept
394{
395 hi_static_not_implemented();
396}
397
398template<std::same_as<char> FromChar>
399[[nodiscard]] constexpr std::u32string to_u32string(std::basic_string_view<FromChar> rhs) noexcept
400{
401 return detail::from_utf8<char32_t>(rhs);
402}
403
404template<std::same_as<char16_t> FromChar>
405[[nodiscard]] constexpr std::u32string to_u32string(std::basic_string_view<FromChar> rhs) noexcept
406{
407 return detail::from_utf16<char32_t>(rhs);
408}
409
410template<std::same_as<wchar_t> FromChar>
411[[nodiscard]] constexpr std::u32string to_u32string(std::basic_string_view<FromChar> rhs) noexcept
412 requires(sizeof(FromChar) == sizeof(char16_t))
413{
414 return detail::from_utf16<char32_t>(rhs);
415}
416
417template<std::same_as<wchar_t> FromChar>
418[[nodiscard]] constexpr std::u32string to_u32string(std::basic_string_view<FromChar> rhs) noexcept
419 requires(sizeof(FromChar) == sizeof(char32_t))
420{
421 return detail::from_utf32<char32_t>(rhs);
422}
423
424template<typename FromChar>
425[[nodiscard]] constexpr std::wstring to_wstring(std::basic_string_view<FromChar> rhs) noexcept
426{
427 hi_static_not_implemented();
428}
429
430template<std::same_as<char> FromChar>
431[[nodiscard]] constexpr std::wstring to_wstring(std::basic_string_view<FromChar> rhs) noexcept
432{
433 return detail::from_utf8<wchar_t>(rhs);
434}
435
436template<std::same_as<char16_t> FromChar>
437[[nodiscard]] constexpr std::wstring to_wstring(std::basic_string_view<FromChar> rhs) noexcept
438{
439 return detail::from_utf16<wchar_t>(rhs);
440}
441
442template<std::same_as<char32_t> FromChar>
443[[nodiscard]] constexpr std::wstring to_wstring(std::basic_string_view<FromChar> rhs) noexcept
444{
445 return detail::from_utf32<wchar_t>(rhs);
446}
447
/** Convert a `std::basic_string` to a `std::string`.
 *
 * Forwards to the `std::basic_string_view` overload of `to_string()`.
 *
 * @tparam FromChar The code-unit type of the source string.
 * @param rhs The source string.
 * @return The converted string.
 */
template<typename FromChar>
[[nodiscard]] constexpr std::string to_string(std::basic_string<FromChar> const &rhs) noexcept
{
    auto const view = std::basic_string_view<FromChar>(rhs);
    return to_string(view);
}
453
/** Convert a `std::basic_string` to a `std::u16string`.
 *
 * Forwards to the `std::basic_string_view` overload of `to_u16string()`.
 *
 * @tparam FromChar The code-unit type of the source string.
 * @param rhs The source string.
 * @return The converted string.
 */
template<typename FromChar>
[[nodiscard]] constexpr std::u16string to_u16string(std::basic_string<FromChar> const &rhs) noexcept
{
    auto const view = std::basic_string_view<FromChar>(rhs);
    return to_u16string(view);
}
459
/** Convert a `std::basic_string` to a `std::u32string`.
 *
 * Forwards to the `std::basic_string_view` overload of `to_u32string()`.
 *
 * @tparam FromChar The code-unit type of the source string.
 * @param rhs The source string.
 * @return The converted string.
 */
template<typename FromChar>
[[nodiscard]] constexpr std::u32string to_u32string(std::basic_string<FromChar> const &rhs) noexcept
{
    auto const view = std::basic_string_view<FromChar>(rhs);
    return to_u32string(view);
}
465
/** Convert a `std::basic_string` to a `std::wstring`.
 *
 * Forwards to the `std::basic_string_view` overload of `to_wstring()`.
 * Declared `constexpr` like the `to_string()`, `to_u16string()` and
 * `to_u32string()` overloads that take a `std::basic_string`.
 *
 * @tparam FromChar The code-unit type of the source string.
 * @param rhs The source string.
 * @return The converted string.
 */
template<typename FromChar>
[[nodiscard]] constexpr std::wstring to_wstring(std::basic_string<FromChar> const &rhs) noexcept
{
    return to_wstring(std::basic_string_view<FromChar>(rhs));
}
471
/** Convert a character pointer to a `std::string`.
 *
 * The pointer is handed to the pointer constructor of `std::basic_string_view`,
 * which determines the length by searching for the terminating nul; `rhs`
 * must therefore point to a nul-terminated string.
 *
 * @tparam FromChar The code-unit type of the source string.
 * @param rhs Pointer to the nul-terminated source string.
 * @return The converted string.
 */
template<typename FromChar>
[[nodiscard]] std::string to_string(FromChar const *rhs) noexcept
{
    auto const view = std::basic_string_view<FromChar>(rhs);
    return to_string(view);
}
477
/** Convert a character pointer to a `std::u16string`.
 *
 * The pointer is handed to the pointer constructor of `std::basic_string_view`,
 * which determines the length by searching for the terminating nul; `rhs`
 * must therefore point to a nul-terminated string.
 *
 * @tparam FromChar The code-unit type of the source string.
 * @param rhs Pointer to the nul-terminated source string.
 * @return The converted string.
 */
template<typename FromChar>
[[nodiscard]] std::u16string to_u16string(FromChar const *rhs) noexcept
{
    auto const view = std::basic_string_view<FromChar>(rhs);
    return to_u16string(view);
}
483
/** Convert a character pointer to a `std::u32string`.
 *
 * The pointer is handed to the pointer constructor of `std::basic_string_view`,
 * which determines the length by searching for the terminating nul; `rhs`
 * must therefore point to a nul-terminated string.
 *
 * @tparam FromChar The code-unit type of the source string.
 * @param rhs Pointer to the nul-terminated source string.
 * @return The converted string.
 */
template<typename FromChar>
[[nodiscard]] std::u32string to_u32string(FromChar const *rhs) noexcept
{
    auto const view = std::basic_string_view<FromChar>(rhs);
    return to_u32string(view);
}
489
/** Convert a character pointer to a `std::wstring`.
 *
 * The pointer is handed to the pointer constructor of `std::basic_string_view`,
 * which determines the length by searching for the terminating nul; `rhs`
 * must therefore point to a nul-terminated string.
 *
 * @tparam FromChar The code-unit type of the source string.
 * @param rhs Pointer to the nul-terminated source string.
 * @return The converted string.
 */
template<typename FromChar>
[[nodiscard]] std::wstring to_wstring(FromChar const *rhs) noexcept
{
    auto const view = std::basic_string_view<FromChar>(rhs);
    return to_wstring(view);
}
495
496} // namespace hi::inline v1
This file includes required definitions.
#define hilet
Invariant should be the default for variables.
Definition required.hpp:23
T begin(T... args)
T distance(T... args)
T end(T... args)
T to_string(T... args)
T to_wstring(T... args)