HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
unicode_normalization.hpp
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "ucd_decompositions.hpp"
8#include "ucd_compositions.hpp"
9#include "ucd_canonical_combining_classes.hpp"
10#include "unicode_description.hpp"
11#include "../algorithm/module.hpp"
12#include "../utility/utility.hpp"
13#include "../macros.hpp"
14#include <cstdint>
15#include <string>
16#include <string_view>
17
18
19
20namespace hi::inline v1 {
21
// Bit-mask selecting which decomposition types are applied while decomposing.
// add() sets the bit whose index is `std::to_underlying(type)`.
25 uint64_t decomposition_mask : 17 = 0;
26
// When set, detail::unicode_decompose() drops U+0000..U+001F and U+007F
// (unless the code-point was already handled as a separator).
29 uint64_t drop_C0 : 1 = 0;
30
// When set, detail::unicode_decompose() drops U+0080..U+009F
// (unless the code-point was already handled as a separator).
33 uint64_t drop_C1 : 1 = 0;
34
38
42
46
// Code-point emitted in place of every code-point listed in `line_separators`.
// When this is `unicode_CR`, a `unicode_LF` is appended directly after it.
51 char32_t line_separator_character = unicode_LS;
52
// Code-point emitted in place of every code-point listed in `paragraph_separators`.
// When this is `unicode_CR`, a `unicode_LF` is appended directly after it.
57 char32_t paragraph_separator_character = unicode_PS;
58
59 constexpr unicode_normalize_config& add(unicode_decomposition_type type) noexcept
60 {
61 decomposition_mask |= 1_uz << std::to_underlying(type);
62 return *this;
63 }
64
65 [[nodiscard]] constexpr static unicode_normalize_config NFD() noexcept
66 {
67 auto r = unicode_normalize_config();
68 r.add(unicode_decomposition_type::canonical);
69 return r;
70 }
71
72 [[nodiscard]] constexpr static unicode_normalize_config NFC() noexcept
73 {
74 return NFD();
75 }
76
79 [[nodiscard]] constexpr static unicode_normalize_config NFC_PS_noctr() noexcept
80 {
81 auto r = NFC();
82 r.drop_C0 = 1;
83 r.drop_C1 = 1;
84 r.drop = U"\r";
85 r.paragraph_separators = U"\n\v\f\u0085\u2028\u2029";
86 r.paragraph_separator_character = U'\u2029';
87 return r;
88 }
89
92 [[nodiscard]] constexpr static unicode_normalize_config NFC_CRLF_noctr() noexcept
93 {
94 auto r = NFC();
95 r.drop_C0 = 1;
96 r.drop_C1 = 1;
97 r.drop = U"\r";
98 r.paragraph_separators = U"\n\v\f\u0085\u2028\u2029";
99 r.paragraph_separator_character = U'\r';
100 return r;
101 }
102
103 [[nodiscard]] constexpr static unicode_normalize_config NFKD() noexcept
104 {
105 auto r = unicode_normalize_config::NFD();
106 r.add(unicode_decomposition_type::canonical);
107 r.add(unicode_decomposition_type::font);
108 r.add(unicode_decomposition_type::noBreak);
109 r.add(unicode_decomposition_type::initial);
110 r.add(unicode_decomposition_type::medial);
111 r.add(unicode_decomposition_type::_final);
112 r.add(unicode_decomposition_type::isolated);
113 r.add(unicode_decomposition_type::circle);
114 r.add(unicode_decomposition_type::super);
115 r.add(unicode_decomposition_type::sub);
116 r.add(unicode_decomposition_type::fraction);
117 r.add(unicode_decomposition_type::vertical);
118 r.add(unicode_decomposition_type::wide);
119 r.add(unicode_decomposition_type::narrow);
120 r.add(unicode_decomposition_type::small);
121 r.add(unicode_decomposition_type::square);
122 r.add(unicode_decomposition_type::compat);
123 return r;
124 }
125
126 [[nodiscard]] constexpr static unicode_normalize_config NFKC() noexcept
127 {
128 return NFKD();
129 }
130};
131
132namespace detail {
133
134constexpr void unicode_decompose(char32_t code_point, unicode_normalize_config config, std::u32string& r) noexcept
135{
136 for (hilet c : config.line_separators) {
137 if (code_point == c) {
138 r += config.line_separator_character;
139 if (config.line_separator_character == unicode_CR) {
140 r += unicode_LF;
141 }
142 return;
143 }
144 }
145
146 for (hilet c : config.paragraph_separators) {
147 if (code_point == c) {
148 r += config.paragraph_separator_character;
149 if (config.paragraph_separator_character == unicode_CR) {
150 r += unicode_LF;
151 }
152 return;
153 }
154 }
155
156 for (hilet c : config.drop) {
157 if (code_point == c) {
158 return;
159 }
160 }
161
162 if (config.drop_C0 and ((code_point >= U'\u0000' and code_point <= U'\u001f') or code_point == U'\u007f')) {
163 return;
164 }
165
166 if (config.drop_C1 and code_point >= U'\u0080' and code_point <= U'\u009f') {
167 return;
168 }
169
170 hilet decomposition_info = ucd_get_decomposition(code_point);
171 if (decomposition_info.should_decompose(config.decomposition_mask)) {
172 for (hilet c : decomposition_info.decompose()) {
173 unicode_decompose(c, config, r);
174 }
175
176 } else {
177 hilet ccc = ucd_get_canonical_combining_class(code_point);
178 r += code_point | (wide_cast<char32_t>(ccc) << 24);
179 }
180}
181
182constexpr void unicode_decompose(std::u32string_view text, unicode_normalize_config config, std::u32string& r) noexcept
183{
184 for (hilet c : text) {
185 unicode_decompose(c, config, r);
186 }
187}
188
189constexpr void unicode_compose(std::u32string& text) noexcept
190{
191 if (text.size() <= 1) {
192 return;
193 }
194
195 // This algorithm reads using `i`-index and writes using the `j`-index.
196 // When compositing characters, `j` will lag behind.
197 auto i = 0_uz;
198 auto j = 0_uz;
199 while (i != text.size()) {
200 hilet code_unit = text[i++];
201 hilet code_point = code_unit & 0xff'ffff;
202 hilet combining_class = code_unit >> 24;
203 hilet first_is_starter = combining_class == 0;
204
205 if (code_unit == 0xffff'ffff) {
206 // Snuffed out by compositing in this algorithm.
207 // We continue going forward looking for code-points.
208
209 } else if (first_is_starter) {
210 // Try composing.
211 auto first_code_point = code_point;
212 char32_t previous_combining_class = 0;
213 for (auto k = i; k != text.size(); ++k) {
214 hilet second_code_unit = text[k];
215 hilet second_code_point = second_code_unit & 0xff'ffff;
216 hilet second_combining_class = second_code_unit >> 24;
217
218 hilet blocking_pair = previous_combining_class != 0 and previous_combining_class >= second_combining_class;
219 hilet second_is_starter = second_combining_class == 0;
220
221 hilet composed_code_point = ucd_get_composition(first_code_point, second_code_point);
222 if (composed_code_point and not blocking_pair) {
223 // Found a composition.
224 first_code_point = *composed_code_point;
225 // The canonical combined DecompositionOrder is always zero.
226 previous_combining_class = 0;
227 // Snuff out the code-unit.
228 text[k] = 0xffff'ffff;
229
230 } else if (second_is_starter) {
231 // End after failing to compose with the next start-character.
232 break;
233
234 } else {
235 // The start character is not composing with this composingC.
236 previous_combining_class = second_combining_class;
237 }
238 }
239 // Add the new combined character to the text.
240 text[j++] = first_code_point;
241
242 } else {
243 // Unable to compose this character.
244 text[j++] = code_point;
245 }
246 }
247
248 text.resize(j);
249}
250
251constexpr void unicode_reorder(std::u32string& text) noexcept
252{
253 constexpr auto ccc_less = [](char32_t a, char32_t b) {
254 return (a >> 24) < (b >> 24);
255 };
256
257 hilet first = text.begin();
258 hilet last = text.end();
259
260 if (first == last) {
261 return;
262 }
263
264 auto cluster_it = first;
265 for (auto it = cluster_it + 1; it != last; ++it) {
266 if (*it <= 0xff'ffff) {
267 std::stable_sort(cluster_it, it, ccc_less);
268 cluster_it = it;
269 }
270 }
271
272 std::stable_sort(cluster_it, last, ccc_less);
273}
274
275constexpr void unicode_clean(std::u32string& text) noexcept
276{
277 // clean up the text by removing the upper bits.
278 for (auto& codePoint : text) {
279 codePoint &= 0x1f'ffff;
280 }
281}
282
283} // namespace detail
284
290[[nodiscard]] constexpr std::u32string
291unicode_decompose(std::u32string_view text, unicode_normalize_config config = unicode_normalize_config::NFD()) noexcept
292{
293 auto r = std::u32string{};
294 detail::unicode_decompose(text, config, r);
295 detail::unicode_reorder(r);
296 detail::unicode_clean(r);
297 return r;
298}
299
305[[nodiscard]] constexpr std::u32string
306unicode_normalize(std::u32string_view text, unicode_normalize_config config = unicode_normalize_config::NFC()) noexcept
307{
308 auto r = std::u32string{};
309 detail::unicode_decompose(text, config, r);
310 detail::unicode_reorder(r);
311 detail::unicode_compose(r);
312 detail::unicode_clean(r);
313 return r;
314}
315
321template<std::input_iterator It, std::sentinel_for<It> ItEnd>
322[[nodiscard]] constexpr bool unicode_is_NFC_grapheme(It it, ItEnd last) noexcept
323{
324 if (it == last) {
325 // Needs to have at least one code-point.
326 return false;
327 }
328
329 if (std::distance(it, last) > 31) {
330 // A maximum 30 marks is allowed after the starter.
331 return false;
332 }
333
334 if (ucd_get_canonical_combining_class(*it++) != 0) {
335 // The first code-point must be a starter (CCC == 0).
336 return false;
337 }
338
339 // Check if each consequtive code-point is a mark (CCC != 0).
340 // And that the CCC is ordered by numeric value.
341 auto max_ccc = uint8_t{1};
342 for (; it != last; ++it) {
343 hilet ccc = ucd_get_canonical_combining_class(*it);
344 if (ccc < max_ccc) {
345 return false;
346 }
347 max_ccc = ccc;
348
349 // XXX Needs check if code-point is allowed in NFC.
350
351 }
352
353 // All tests pass.
354 return true;
355}
356
357} // namespace hi::inline v1
DOXYGEN BUG.
Definition algorithm.hpp:16
constexpr bool unicode_is_NFC_grapheme(It it, ItEnd last) noexcept
Check if the string of code-points is a single grapheme in NFC normal form.
Definition unicode_normalization.hpp:322
constexpr std::u32string unicode_normalize(std::u32string_view text, unicode_normalize_config config=unicode_normalize_config::NFC()) noexcept
Convert text to a Unicode composed normal form.
Definition unicode_normalization.hpp:306
constexpr ucd_decomposition_info ucd_get_decomposition(char32_t code_point) noexcept
Get the decomposition info of a code-point.
Definition ucd_decompositions.hpp:4800
constexpr Out narrow_cast(In const &rhs) noexcept
Cast numeric values without loss of precision.
Definition cast.hpp:377
constexpr std::optional< char32_t > ucd_get_composition(char32_t cp1, char32_t cp2) noexcept
Get the composition info of two code-points.
Definition ucd_compositions.hpp:2338
Definition unicode_normalization.hpp:22
static constexpr unicode_normalize_config NFC_PS_noctr() noexcept
Use NFC normalization, convert all line-feed-like characters to PS.
Definition unicode_normalization.hpp:79
uint64_t decomposition_mask
The types of decompositions, that should be used when decomposing.
Definition unicode_normalization.hpp:25
std::u32string drop
Code-points to be dropped.
Definition unicode_normalization.hpp:45
uint64_t drop_C1
Drop the C1 control characters.
Definition unicode_normalization.hpp:33
std::u32string paragraph_separators
Code-points to be treated as paragraph-separators.
Definition unicode_normalization.hpp:41
uint64_t drop_C0
Drop the C0 control characters.
Definition unicode_normalization.hpp:29
static constexpr unicode_normalize_config NFC_CRLF_noctr() noexcept
Use NFC normalization, convert all line-feed-like characters to CR-LF.
Definition unicode_normalization.hpp:92
std::u32string line_separators
Code-points to be treated as line-separators.
Definition unicode_normalization.hpp:37
T distance(T... args)
T stable_sort(T... args)