HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
unicode_normalization.hpp
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "ucd_decompositions.hpp"
8#include "ucd_compositions.hpp"
9#include "ucd_canonical_combining_classes.hpp"
10#include "unicode_description.hpp"
11#include "../algorithm/algorithm.hpp"
12#include "../utility/utility.hpp"
13#include "../macros.hpp"
14#include <cstdint>
15#include <string>
16#include <string_view>
17#include <algorithm>
18
19hi_export_module(hikogui.unicode.unicode_normalization);
20
21
22hi_export namespace hi::inline v1 {
23
27 uint64_t decomposition_mask : 17 = 0;
28
31 uint64_t drop_C0 : 1 = 0;
32
35 uint64_t drop_C1 : 1 = 0;
36
40
44
48
53 char32_t line_separator_character = unicode_LS;
54
59 char32_t paragraph_separator_character = unicode_PS;
60
61 constexpr unicode_normalize_config& add(unicode_decomposition_type type) noexcept
62 {
63 decomposition_mask |= 1_uz << std::to_underlying(type);
64 return *this;
65 }
66
67 [[nodiscard]] constexpr static unicode_normalize_config NFD() noexcept
68 {
69 auto r = unicode_normalize_config();
70 r.add(unicode_decomposition_type::canonical);
71 return r;
72 }
73
74 [[nodiscard]] constexpr static unicode_normalize_config NFC() noexcept
75 {
76 return NFD();
77 }
78
81 [[nodiscard]] constexpr static unicode_normalize_config NFC_PS_noctr() noexcept
82 {
83 auto r = NFC();
84 r.drop_C0 = 1;
85 r.drop_C1 = 1;
86 r.drop = U"\r";
87 r.paragraph_separators = U"\n\v\f\u0085\u2028\u2029";
88 r.paragraph_separator_character = U'\u2029';
89 return r;
90 }
91
94 [[nodiscard]] constexpr static unicode_normalize_config NFC_CRLF_noctr() noexcept
95 {
96 auto r = NFC();
97 r.drop_C0 = 1;
98 r.drop_C1 = 1;
99 r.drop = U"\r";
100 r.paragraph_separators = U"\n\v\f\u0085\u2028\u2029";
101 r.paragraph_separator_character = U'\r';
102 return r;
103 }
104
105 [[nodiscard]] constexpr static unicode_normalize_config NFKD() noexcept
106 {
107 auto r = unicode_normalize_config::NFD();
108 r.add(unicode_decomposition_type::canonical);
109 r.add(unicode_decomposition_type::font);
110 r.add(unicode_decomposition_type::noBreak);
111 r.add(unicode_decomposition_type::initial);
112 r.add(unicode_decomposition_type::medial);
113 r.add(unicode_decomposition_type::_final);
114 r.add(unicode_decomposition_type::isolated);
115 r.add(unicode_decomposition_type::circle);
116 r.add(unicode_decomposition_type::super);
117 r.add(unicode_decomposition_type::sub);
118 r.add(unicode_decomposition_type::fraction);
119 r.add(unicode_decomposition_type::vertical);
120 r.add(unicode_decomposition_type::wide);
121 r.add(unicode_decomposition_type::narrow);
122 r.add(unicode_decomposition_type::small);
123 r.add(unicode_decomposition_type::square);
124 r.add(unicode_decomposition_type::compat);
125 return r;
126 }
127
128 [[nodiscard]] constexpr static unicode_normalize_config NFKC() noexcept
129 {
130 return NFKD();
131 }
132};
133
134namespace detail {
135
136constexpr void unicode_decompose(char32_t code_point, unicode_normalize_config config, std::u32string& r) noexcept
137{
138 for (auto const c : config.line_separators) {
139 if (code_point == c) {
140 r += config.line_separator_character;
141 if (config.line_separator_character == unicode_CR) {
142 r += unicode_LF;
143 }
144 return;
145 }
146 }
147
148 for (auto const c : config.paragraph_separators) {
149 if (code_point == c) {
150 r += config.paragraph_separator_character;
151 if (config.paragraph_separator_character == unicode_CR) {
152 r += unicode_LF;
153 }
154 return;
155 }
156 }
157
158 for (auto const c : config.drop) {
159 if (code_point == c) {
160 return;
161 }
162 }
163
164 if (config.drop_C0 and ((code_point >= U'\u0000' and code_point <= U'\u001f') or code_point == U'\u007f')) {
165 return;
166 }
167
168 if (config.drop_C1 and code_point >= U'\u0080' and code_point <= U'\u009f') {
169 return;
170 }
171
172 auto const decomposition_info = ucd_get_decomposition(code_point);
173 if (decomposition_info.should_decompose(config.decomposition_mask)) {
174 for (auto const c : decomposition_info.decompose()) {
175 unicode_decompose(c, config, r);
176 }
177
178 } else {
179 auto const ccc = ucd_get_canonical_combining_class(code_point);
180 r += code_point | (wide_cast<char32_t>(ccc) << 24);
181 }
182}
183
184constexpr void unicode_decompose(std::u32string_view text, unicode_normalize_config config, std::u32string& r) noexcept
185{
186 for (auto const c : text) {
187 unicode_decompose(c, config, r);
188 }
189}
190
191constexpr void unicode_compose(std::u32string& text) noexcept
192{
193 if (text.size() <= 1) {
194 return;
195 }
196
197 // This algorithm reads using `i`-index and writes using the `j`-index.
198 // When compositing characters, `j` will lag behind.
199 auto i = 0_uz;
200 auto j = 0_uz;
201 while (i != text.size()) {
202 auto const code_unit = text[i++];
203 auto const code_point = code_unit & 0xff'ffff;
204 auto const combining_class = code_unit >> 24;
205 auto const first_is_starter = combining_class == 0;
206
207 if (code_unit == 0xffff'ffff) {
208 // Snuffed out by compositing in this algorithm.
209 // We continue going forward looking for code-points.
210
211 } else if (first_is_starter) {
212 // Try composing.
213 auto first_code_point = code_point;
214 char32_t previous_combining_class = 0;
215 for (auto k = i; k != text.size(); ++k) {
216 auto const second_code_unit = text[k];
217 auto const second_code_point = second_code_unit & 0xff'ffff;
218 auto const second_combining_class = second_code_unit >> 24;
219
220 auto const blocking_pair = previous_combining_class != 0 and previous_combining_class >= second_combining_class;
221 auto const second_is_starter = second_combining_class == 0;
222
223 auto const composed_code_point = ucd_get_composition(first_code_point, second_code_point);
224 if (composed_code_point and not blocking_pair) {
225 // Found a composition.
226 first_code_point = *composed_code_point;
227 // The canonical combined DecompositionOrder is always zero.
228 previous_combining_class = 0;
229 // Snuff out the code-unit.
230 text[k] = 0xffff'ffff;
231
232 } else if (second_is_starter) {
233 // End after failing to compose with the next start-character.
234 break;
235
236 } else {
237 // The start character is not composing with this composingC.
238 previous_combining_class = second_combining_class;
239 }
240 }
241 // Add the new combined character to the text.
242 text[j++] = first_code_point;
243
244 } else {
245 // Unable to compose this character.
246 text[j++] = code_point;
247 }
248 }
249
250 text.resize(j);
251}
252
253constexpr void unicode_reorder(std::u32string& text) noexcept
254{
255 constexpr auto ccc_less = [](char32_t a, char32_t b) {
256 return (a >> 24) < (b >> 24);
257 };
258
259 auto const first = text.begin();
260 auto const last = text.end();
261
262 if (first == last) {
263 return;
264 }
265
266 auto cluster_it = first;
267 for (auto it = cluster_it + 1; it != last; ++it) {
268 if (*it <= 0xff'ffff) {
269 std::stable_sort(cluster_it, it, ccc_less);
270 cluster_it = it;
271 }
272 }
273
274 std::stable_sort(cluster_it, last, ccc_less);
275}
276
277constexpr void unicode_clean(std::u32string& text) noexcept
278{
279 // clean up the text by removing the upper bits.
280 for (auto& codePoint : text) {
281 codePoint &= 0x1f'ffff;
282 }
283}
284
285} // namespace detail
286
292[[nodiscard]] constexpr std::u32string
293unicode_decompose(std::u32string_view text, unicode_normalize_config config = unicode_normalize_config::NFD()) noexcept
294{
295 auto r = std::u32string{};
296 detail::unicode_decompose(text, config, r);
297 detail::unicode_reorder(r);
298 detail::unicode_clean(r);
299 return r;
300}
301
307[[nodiscard]] constexpr std::u32string
308unicode_normalize(std::u32string_view text, unicode_normalize_config config = unicode_normalize_config::NFC()) noexcept
309{
310 auto r = std::u32string{};
311 detail::unicode_decompose(text, config, r);
312 detail::unicode_reorder(r);
313 detail::unicode_compose(r);
314 detail::unicode_clean(r);
315 return r;
316}
317
323template<std::input_iterator It, std::sentinel_for<It> ItEnd>
324[[nodiscard]] constexpr bool unicode_is_NFC_grapheme(It it, ItEnd last) noexcept
325{
326 if (it == last) {
327 // Needs to have at least one code-point.
328 return false;
329 }
330
331 if (std::distance(it, last) > 31) {
332 // A maximum 30 marks is allowed after the starter.
333 return false;
334 }
335
336 if (ucd_get_canonical_combining_class(*it++) != 0) {
337 // The first code-point must be a starter (CCC == 0).
338 return false;
339 }
340
341 // Check if each consequtive code-point is a mark (CCC != 0).
342 // And that the CCC is ordered by numeric value.
343 auto max_ccc = uint8_t{1};
344 for (; it != last; ++it) {
345 auto const ccc = ucd_get_canonical_combining_class(*it);
346 if (ccc < max_ccc) {
347 return false;
348 }
349 max_ccc = ccc;
350
351 // XXX Needs check if code-point is allowed in NFC.
352
353 }
354
355 // All tests pass.
356 return true;
357}
358
359} // namespace hi::inline v1
constexpr ucd_decomposition_info ucd_get_decomposition(char32_t code_point) noexcept
Get the decomposition info of a code-point.
Definition ucd_decompositions.hpp:4803
constexpr std::optional< char32_t > ucd_get_composition(char32_t cp1, char32_t cp2) noexcept
Get the composition info of two code-points.
Definition ucd_compositions.hpp:2341
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
constexpr bool unicode_is_NFC_grapheme(It it, ItEnd last) noexcept
Check if the string of code-points is a single grapheme in NFC normal form.
Definition unicode_normalization.hpp:324
constexpr std::u32string unicode_normalize(std::u32string_view text, unicode_normalize_config config=unicode_normalize_config::NFC()) noexcept
Convert text to a Unicode composed normal form.
Definition unicode_normalization.hpp:308
Definition unicode_normalization.hpp:24
static constexpr unicode_normalize_config NFC_PS_noctr() noexcept
Use NFC normalization, convert all line-feed-like characters to PS.
Definition unicode_normalization.hpp:81
uint64_t decomposition_mask
The types of decompositions, that should be used when decomposing.
Definition unicode_normalization.hpp:27
std::u32string drop
Code-points to be dropped.
Definition unicode_normalization.hpp:47
uint64_t drop_C1
Drop the C1 control characters.
Definition unicode_normalization.hpp:35
std::u32string paragraph_separators
Code-points to be treated as paragraph-separators.
Definition unicode_normalization.hpp:43
uint64_t drop_C0
Drop the C0 control characters.
Definition unicode_normalization.hpp:31
static constexpr unicode_normalize_config NFC_CRLF_noctr() noexcept
Use NFC normalization, convert all line-feed-like characters to CR-LF.
Definition unicode_normalization.hpp:94
std::u32string line_separators
Code-points to be treated as line-separators.
Definition unicode_normalization.hpp:39
T distance(T... args)
T stable_sort(T... args)