HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
utf_8.hpp
Go to the documentation of this file.
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
9#pragma once
10
11#include "char_converter.hpp"
12#include "cp_1252.hpp"
13#include <bit>
14
15hi_warning_push();
16// C26490: Don't use reinterpret_cast.
17// Needed for SIMD intrinsics.
18hi_warning_ignore_msvc(26490);
19
20namespace hi { inline namespace v1 {
21
25template<>
26struct char_map<"utf-8"> {
27 using char_type = char;
28 using fallback_encoder_type = char_map<"cp-1252">;
29 using fallback_char_type = fallback_encoder_type::char_type;
30
31 [[nodiscard]] constexpr std::endian guess_endian(void const *ptr, size_t size, std::endian endian) const noexcept
32 {
33 return std::endian::native;
34 }
35
36 template<typename It, typename EndIt>
37 [[nodiscard]] constexpr std::pair<char32_t, bool> read_fallback(It& it, EndIt last) const noexcept
38 {
39 hilet[code_point, valid] = fallback_encoder_type{}.read(it, last);
40 return {code_point, false};
41 }
42
43 template<typename It, typename EndIt>
44 [[nodiscard]] constexpr std::pair<char32_t, bool> read(It& it, EndIt last) const noexcept
45 {
46 hi_axiom(it != last);
47
48 auto cu = *it++;
49 if (not to_bool(cu & 0x80)) [[likely]] {
50 // ASCII character.
51 return {char_cast<char32_t>(cu), true};
52
53 } else if (it == last or (cu & 0xc0) == 0x80) [[unlikely]] {
54 // A non-ASCII character at the end of string.
55 // or an unexpected continuation code-unit should be treated as CP-1252.
56 --it;
57 return read_fallback(it, last);
58
59 } else {
60 hilet length = narrow_cast<uint8_t>(std::countl_one(char_cast<uint8_t>(cu)));
61 hi_axiom(length >= 2);
62
63 // First part of the code-point.
64 auto cp = char_cast<char32_t>(cu & (0x7f >> length));
65
66 // Read the first continuation code-unit which is always here.
67 cu = *it++;
68 cp <<= 6;
69 cp |= cu & 0x3f;
70 if ((cu & 0xc0) != 0x80) [[unlikely]] {
71 // If the second code-unit is not a UTF-8 continuation character, treat the first
72 // code-unit as if it was CP-1252.
73 it -= 2;
74 return read_fallback(it, last);
75 }
76
77 // After we read the first two code-units how many more to do.
78 auto todo = length - 2;
79 if (todo > std::distance(it, last)) [[unlikely]] {
80 // If there is a start and a continuation code-unit in a row we consider this to be UTF-8 encoded.
81 // So at this point any errors are replaced with 0xfffd.
82 it = last;
83 return {0xfffd, false};
84 }
85
86 while (todo--) {
87 cu = *it++;
88 cp <<= 6;
89 cp |= cu & 0x3f;
90 if ((cu & 0xc0) != 0x80) [[unlikely]] {
91 // Unexpected end of sequence.
92 --it;
93 return {0xfffd, false};
94 }
95 }
96
97 auto valid = true;
98 // Valid range.
99 valid &= cp < 0x11'0000;
100 // Not a surrogate.
101 valid &= cp < 0xd800 or cp >= 0xe000;
102 // Not overlong encoded.
103 valid &= length == narrow_cast<uint8_t>((cp > 0x7f) + (cp > 0x7ff) + (cp > 0xffff) + 1);
104 if (not valid) [[unlikely]] {
105 return {0xfffd, false};
106 }
107 return {cp, true};
108 }
109 }
110
111 [[nodiscard]] constexpr std::pair<uint8_t, bool> size(char32_t code_point) const noexcept
112 {
113 hi_axiom(code_point < 0x11'0000);
114 hi_axiom(not(code_point >= 0xd800 and code_point < 0xe000));
115
116 return {narrow_cast<uint8_t>((code_point > 0x7f) + (code_point > 0x7ff) + (code_point > 0xffff) + 1), true};
117 }
118
119 template<typename It>
120 constexpr void write(char32_t code_point, It& dst) const noexcept
121 {
122 hi_axiom(code_point < 0x11'0000);
123 hi_axiom(not(code_point >= 0xd800 and code_point < 0xe000));
124
125 hilet length = truncate<uint8_t>((code_point > 0x7f) + (code_point > 0x7ff) + (code_point > 0xffff));
126 if (auto i = length) {
127 do {
128 dst[i] = truncate<char8_t>((code_point & 0x3f) | 0x80);
129 code_point >>= 6;
130 } while (--i);
131
132 code_point |= 0x780 >> length;
133 }
134 dst[0] = truncate<char8_t>(code_point);
135 dst += length + 1_uz;
136 }
137
138#if defined(HI_HAS_SSE2)
139 template<typename It>
140 hi_force_inline __m128i read_ascii_chunk16(It it) const noexcept
141 {
142 return _mm_loadu_si128(reinterpret_cast<__m128i const *>(std::addressof(*it)));
143 }
144
145 template<typename It>
146 hi_force_inline void write_ascii_chunk16(__m128i chunk, It dst) const noexcept
147 {
148 _mm_storeu_si128(reinterpret_cast<__m128i *>(std::addressof(*dst)), chunk);
149 }
150#endif
151};
152
153}} // namespace hi::v1
154
155hi_warning_pop();
Definition of the CP-1252 / Windows-1252 character map.
Definition of the char_converter<From,To> functor.
#define hi_axiom(expression,...)
Specify an axiom; an expression that is true.
Definition assert.hpp:238
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
@ read
Allow read access to a file.
@ write
Allow write access to a file.
DOXYGEN BUG.
Definition algorithm.hpp:13
geometry/margins.hpp
Definition cache.hpp:11
Character encoder/decoder template.
Definition char_converter.hpp:83
T addressof(T... args)
T distance(T... args)