HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
unicode_normalization.hpp
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "ucd_decompositions.hpp"
8#include "ucd_compositions.hpp"
9#include "ucd_canonical_combining_classes.hpp"
10#include "unicode_description.hpp"
11#include "../utility/module.hpp"
12#include "../algorithm.hpp"
13#include <cstdint>
14#include <string>
15#include <string_view>
16
17namespace hi::inline v1 {
18
/** The types of decompositions, that should be used when decomposing.
 *
 * `add()` sets one bit in this mask per enabled decomposition type; the bit
 * position is the underlying value of the `unicode_decomposition_type`.
 */
22 uint64_t decomposition_mask : 17 = 0;
23
/** Drop the C0 control characters.
 *
 * When set, code-points U+0000 to U+001F and U+007F are removed during decomposition.
 */
26 uint64_t drop_C0 : 1 = 0;
27
/** Drop the C1 control characters.
 *
 * When set, code-points U+0080 to U+009F are removed during decomposition.
 */
30 uint64_t drop_C1 : 1 = 0;
31
35
39
43
/** The code-point that is emitted in place of each code-point listed in `line_separators`.
 *
 * When this is CR, the decomposition step emits a LF directly after it.
 */
48 char32_t line_separator_character = unicode_LS;
49
/** The code-point that is emitted in place of each code-point listed in `paragraph_separators`.
 *
 * When this is CR, the decomposition step emits a LF directly after it.
 */
54 char32_t paragraph_separator_character = unicode_PS;
55
56 constexpr unicode_normalize_config& add(unicode_decomposition_type type) noexcept
57 {
58 decomposition_mask |= 1_uz << to_underlying(type);
59 return *this;
60 }
61
62 [[nodiscard]] constexpr static unicode_normalize_config NFD() noexcept
63 {
64 auto r = unicode_normalize_config();
65 r.add(unicode_decomposition_type::canonical);
66 return r;
67 }
68
69 [[nodiscard]] constexpr static unicode_normalize_config NFC() noexcept
70 {
71 return NFD();
72 }
73
76 [[nodiscard]] constexpr static unicode_normalize_config NFC_PS_noctr() noexcept
77 {
78 auto r = NFC();
79 r.drop_C0 = 1;
80 r.drop_C1 = 1;
81 r.drop = U"\r";
82 r.paragraph_separators = U"\n\v\f\u0085\u2028\u2029";
83 r.paragraph_separator_character = U'\u2029';
84 return r;
85 }
86
89 [[nodiscard]] constexpr static unicode_normalize_config NFC_CRLF_noctr() noexcept
90 {
91 auto r = NFC();
92 r.drop_C0 = 1;
93 r.drop_C1 = 1;
94 r.drop = U"\r";
95 r.paragraph_separators = U"\n\v\f\u0085\u2028\u2029";
96 r.paragraph_separator_character = U'\r';
97 return r;
98 }
99
100 [[nodiscard]] constexpr static unicode_normalize_config NFKD() noexcept
101 {
102 auto r = unicode_normalize_config::NFD();
103 r.add(unicode_decomposition_type::canonical);
104 r.add(unicode_decomposition_type::font);
105 r.add(unicode_decomposition_type::noBreak);
106 r.add(unicode_decomposition_type::initial);
107 r.add(unicode_decomposition_type::medial);
108 r.add(unicode_decomposition_type::_final);
109 r.add(unicode_decomposition_type::isolated);
110 r.add(unicode_decomposition_type::circle);
111 r.add(unicode_decomposition_type::super);
112 r.add(unicode_decomposition_type::sub);
113 r.add(unicode_decomposition_type::fraction);
114 r.add(unicode_decomposition_type::vertical);
115 r.add(unicode_decomposition_type::wide);
116 r.add(unicode_decomposition_type::narrow);
117 r.add(unicode_decomposition_type::small);
118 r.add(unicode_decomposition_type::square);
119 r.add(unicode_decomposition_type::compat);
120 return r;
121 }
122
123 [[nodiscard]] constexpr static unicode_normalize_config NFKC() noexcept
124 {
125 return NFKD();
126 }
127};
128
129namespace detail {
130
131constexpr void unicode_decompose(char32_t code_point, unicode_normalize_config config, std::u32string& r) noexcept
132{
133 for (hilet c : config.line_separators) {
134 if (code_point == c) {
135 r += config.line_separator_character;
136 if (config.line_separator_character == unicode_CR) {
137 r += unicode_LF;
138 }
139 return;
140 }
141 }
142
143 for (hilet c : config.paragraph_separators) {
144 if (code_point == c) {
145 r += config.paragraph_separator_character;
146 if (config.paragraph_separator_character == unicode_CR) {
147 r += unicode_LF;
148 }
149 return;
150 }
151 }
152
153 for (hilet c : config.drop) {
154 if (code_point == c) {
155 return;
156 }
157 }
158
159 if (config.drop_C0 and ((code_point >= U'\u0000' and code_point <= U'\u001f') or code_point == U'\u007f')) {
160 return;
161 }
162
163 if (config.drop_C1 and code_point >= U'\u0080' and code_point <= U'\u009f') {
164 return;
165 }
166
167 hilet decomposition_info = ucd_get_decomposition(code_point);
168 if (decomposition_info.should_decompose(config.decomposition_mask)) {
169 for (hilet c : decomposition_info.decompose()) {
170 unicode_decompose(c, config, r);
171 }
172
173 } else {
174 hilet ccc = ucd_get_canonical_combining_class(code_point);
175 r += code_point | (wide_cast<char32_t>(ccc) << 24);
176 }
177}
178
179constexpr void unicode_decompose(std::u32string_view text, unicode_normalize_config config, std::u32string& r) noexcept
180{
181 for (hilet c : text) {
182 unicode_decompose(c, config, r);
183 }
184}
185
186constexpr void unicode_compose(std::u32string& text) noexcept
187{
188 if (text.size() <= 1) {
189 return;
190 }
191
192 // This algorithm reads using `i`-index and writes using the `j`-index.
193 // When compositing characters, `j` will lag behind.
194 auto i = 0_uz;
195 auto j = 0_uz;
196 while (i != text.size()) {
197 hilet code_unit = text[i++];
198 hilet code_point = code_unit & 0xff'ffff;
199 hilet combining_class = code_unit >> 24;
200 hilet first_is_starter = combining_class == 0;
201
202 if (code_unit == 0xffff'ffff) {
203 // Snuffed out by compositing in this algorithm.
204 // We continue going forward looking for code-points.
205
206 } else if (first_is_starter) {
207 // Try composing.
208 auto first_code_point = code_point;
209 char32_t previous_combining_class = 0;
210 for (auto k = i; k != text.size(); ++k) {
211 hilet second_code_unit = text[k];
212 hilet second_code_point = second_code_unit & 0xff'ffff;
213 hilet second_combining_class = second_code_unit >> 24;
214
215 hilet blocking_pair = previous_combining_class != 0 and previous_combining_class >= second_combining_class;
216 hilet second_is_starter = second_combining_class == 0;
217
218 hilet composed_code_point = ucd_get_composition(first_code_point, second_code_point);
219 if (composed_code_point and not blocking_pair) {
220 // Found a composition.
221 first_code_point = *composed_code_point;
222 // The canonical combined DecompositionOrder is always zero.
223 previous_combining_class = 0;
224 // Snuff out the code-unit.
225 text[k] = 0xffff'ffff;
226
227 } else if (second_is_starter) {
228 // End after failing to compose with the next start-character.
229 break;
230
231 } else {
232 // The start character is not composing with this composingC.
233 previous_combining_class = second_combining_class;
234 }
235 }
236 // Add the new combined character to the text.
237 text[j++] = first_code_point;
238
239 } else {
240 // Unable to compose this character.
241 text[j++] = code_point;
242 }
243 }
244
245 text.resize(j);
246}
247
248constexpr void unicode_reorder(std::u32string& text) noexcept
249{
250 constexpr auto ccc_less = [](char32_t a, char32_t b) {
251 return (a >> 24) < (b >> 24);
252 };
253
254 hilet first = text.begin();
255 hilet last = text.end();
256
257 if (first == last) {
258 return;
259 }
260
261 auto cluster_it = first;
262 for (auto it = cluster_it + 1; it != last; ++it) {
263 if (*it <= 0xff'ffff) {
264 std::stable_sort(cluster_it, it, ccc_less);
265 cluster_it = it;
266 }
267 }
268
269 std::stable_sort(cluster_it, last, ccc_less);
270}
271
272constexpr void unicode_clean(std::u32string& text) noexcept
273{
274 // clean up the text by removing the upper bits.
275 for (auto& codePoint : text) {
276 codePoint &= 0x1f'ffff;
277 }
278}
279
280} // namespace detail
281
287[[nodiscard]] constexpr std::u32string
288unicode_decompose(std::u32string_view text, unicode_normalize_config config = unicode_normalize_config::NFD()) noexcept
289{
290 auto r = std::u32string{};
291 detail::unicode_decompose(text, config, r);
292 detail::unicode_reorder(r);
293 detail::unicode_clean(r);
294 return r;
295}
296
302[[nodiscard]] constexpr std::u32string
303unicode_normalize(std::u32string_view text, unicode_normalize_config config = unicode_normalize_config::NFC()) noexcept
304{
305 auto r = std::u32string{};
306 detail::unicode_decompose(text, config, r);
307 detail::unicode_reorder(r);
308 detail::unicode_compose(r);
309 detail::unicode_clean(r);
310 return r;
311}
312
313} // namespace hi::inline v1
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
DOXYGEN BUG.
Definition algorithm.hpp:13
constexpr std::u32string unicode_normalize(std::u32string_view text, unicode_normalize_config config=unicode_normalize_config::NFC()) noexcept
Convert text to a Unicode composed normal form.
Definition unicode_normalization.hpp:303
constexpr ucd_decomposition_info ucd_get_decomposition(char32_t code_point) noexcept
Get the decomposition info of a code-point.
Definition ucd_decompositions.hpp:4798
constexpr std::optional< char32_t > ucd_get_composition(char32_t cp1, char32_t cp2) noexcept
Get the composition info of two code-points.
Definition ucd_compositions.hpp:2336
Definition unicode_normalization.hpp:19
static constexpr unicode_normalize_config NFC_PS_noctr() noexcept
Use NFC normalization, convert all line-feed-like characters to PS.
Definition unicode_normalization.hpp:76
uint64_t decomposition_mask
The types of decompositions, that should be used when decomposing.
Definition unicode_normalization.hpp:22
std::u32string drop
Code-points to be dropped.
Definition unicode_normalization.hpp:42
uint64_t drop_C1
Drop the C1 control characters.
Definition unicode_normalization.hpp:30
std::u32string paragraph_separators
Code-points to be treated as paragraph-separators.
Definition unicode_normalization.hpp:38
uint64_t drop_C0
Drop the C0 control characters.
Definition unicode_normalization.hpp:26
static constexpr unicode_normalize_config NFC_CRLF_noctr() noexcept
Use NFC normalization, convert all line-feed-like characters to CR-LF.
Definition unicode_normalization.hpp:89
std::u32string line_separators
Code-points to be treated as line-separators.
Definition unicode_normalization.hpp:34
T begin(T... args)
T end(T... args)
T resize(T... args)
T size(T... args)
T stable_sort(T... args)