7#include "ucd_decompositions.hpp"
8#include "ucd_compositions.hpp"
9#include "ucd_canonical_combining_classes.hpp"
10#include "unicode_description.hpp"
11#include "../algorithm/algorithm.hpp"
12#include "../utility/utility.hpp"
13#include "../macros.hpp"
19hi_export_module(hikogui.unicode.unicode_normalization);
22hi_export
namespace hi::inline
v1 {
53 char32_t line_separator_character = unicode_LS;
59 char32_t paragraph_separator_character = unicode_PS;
63 decomposition_mask |= 1_uz << std::to_underlying(type);
70 r.add(unicode_decomposition_type::canonical);
74 [[nodiscard]]
constexpr static unicode_normalize_config NFC() noexcept
87 r.paragraph_separators = U
"\n\v\f\u0085\u2028\u2029";
88 r.paragraph_separator_character = U
'\u2029';
100 r.paragraph_separators = U
"\n\v\f\u0085\u2028\u2029";
101 r.paragraph_separator_character = U
'\r';
107 auto r = unicode_normalize_config::NFD();
108 r.add(unicode_decomposition_type::canonical);
109 r.add(unicode_decomposition_type::font);
110 r.add(unicode_decomposition_type::noBreak);
111 r.add(unicode_decomposition_type::initial);
112 r.add(unicode_decomposition_type::medial);
113 r.add(unicode_decomposition_type::_final);
114 r.add(unicode_decomposition_type::isolated);
115 r.add(unicode_decomposition_type::circle);
116 r.add(unicode_decomposition_type::super);
117 r.add(unicode_decomposition_type::sub);
118 r.add(unicode_decomposition_type::fraction);
119 r.add(unicode_decomposition_type::vertical);
120 r.add(unicode_decomposition_type::wide);
121 r.add(unicode_decomposition_type::narrow);
122 r.add(unicode_decomposition_type::small);
123 r.add(unicode_decomposition_type::square);
124 r.add(unicode_decomposition_type::compat);
128 [[nodiscard]]
constexpr static unicode_normalize_config NFKC() noexcept
136constexpr void unicode_decompose(
char32_t code_point, unicode_normalize_config config,
std::u32string& r)
noexcept
138 for (
auto const c : config.line_separators) {
139 if (code_point == c) {
140 r += config.line_separator_character;
141 if (config.line_separator_character == unicode_CR) {
148 for (
auto const c : config.paragraph_separators) {
149 if (code_point == c) {
150 r += config.paragraph_separator_character;
151 if (config.paragraph_separator_character == unicode_CR) {
158 for (
auto const c : config.drop) {
159 if (code_point == c) {
164 if (config.drop_C0 and ((code_point >= U
'\u0000' and code_point <= U
'\u001f') or code_point == U
'\u007f')) {
168 if (config.drop_C1 and code_point >= U
'\u0080' and code_point <= U
'\u009f') {
173 if (decomposition_info.should_decompose(config.decomposition_mask)) {
174 for (
auto const c : decomposition_info.decompose()) {
175 unicode_decompose(c, config, r);
179 auto const ccc = ucd_get_canonical_combining_class(code_point);
180 r += code_point | (wide_cast<char32_t>(ccc) << 24);
184constexpr void unicode_decompose(std::u32string_view text, unicode_normalize_config config,
std::u32string& r)
noexcept
186 for (
auto const c : text) {
187 unicode_decompose(c, config, r);
193 if (text.size() <= 1) {
201 while (i != text.size()) {
202 auto const code_unit = text[i++];
203 auto const code_point = code_unit & 0xff'ffff;
204 auto const combining_class = code_unit >> 24;
205 auto const first_is_starter = combining_class == 0;
207 if (code_unit == 0xffff'ffff) {
211 }
else if (first_is_starter) {
213 auto first_code_point = code_point;
214 char32_t previous_combining_class = 0;
215 for (
auto k = i; k != text.size(); ++k) {
216 auto const second_code_unit = text[k];
217 auto const second_code_point = second_code_unit & 0xff'ffff;
218 auto const second_combining_class = second_code_unit >> 24;
220 auto const blocking_pair = previous_combining_class != 0 and previous_combining_class >= second_combining_class;
221 auto const second_is_starter = second_combining_class == 0;
224 if (composed_code_point and not blocking_pair) {
226 first_code_point = *composed_code_point;
228 previous_combining_class = 0;
230 text[k] = 0xffff'ffff;
232 }
else if (second_is_starter) {
238 previous_combining_class = second_combining_class;
242 text[j++] = first_code_point;
246 text[j++] = code_point;
255 constexpr auto ccc_less = [](
char32_t a,
char32_t b) {
256 return (a >> 24) < (b >> 24);
259 auto const first = text.begin();
260 auto const last = text.end();
266 auto cluster_it = first;
267 for (
auto it = cluster_it + 1; it != last; ++it) {
268 if (*it <= 0xff'ffff) {
280 for (
auto& codePoint : text) {
281 codePoint &= 0x1f'ffff;
296 detail::unicode_decompose(text, config, r);
297 detail::unicode_reorder(r);
298 detail::unicode_clean(r);
311 detail::unicode_decompose(text, config, r);
312 detail::unicode_reorder(r);
313 detail::unicode_compose(r);
314 detail::unicode_clean(r);
323template<std::input_iterator It, std::sentinel_for<It> ItEnd>
336 if (ucd_get_canonical_combining_class(*it++) != 0) {
343 auto max_ccc = uint8_t{1};
344 for (; it != last; ++it) {
345 auto const ccc = ucd_get_canonical_combining_class(*it);
constexpr ucd_decomposition_info ucd_get_decomposition(char32_t code_point) noexcept
Get the decomposition info of a code-point.
Definition ucd_decompositions.hpp:4803
constexpr std::optional< char32_t > ucd_get_composition(char32_t cp1, char32_t cp2) noexcept
Get the composition info of two code-points.
Definition ucd_compositions.hpp:2341
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
constexpr bool unicode_is_NFC_grapheme(It it, ItEnd last) noexcept
Check if the string of code-points is a single grapheme in NFC normal form.
Definition unicode_normalization.hpp:324
constexpr std::u32string unicode_normalize(std::u32string_view text, unicode_normalize_config config=unicode_normalize_config::NFC()) noexcept
Convert text to a Unicode composed normal form.
Definition unicode_normalization.hpp:308
Definition unicode_normalization.hpp:24
static constexpr unicode_normalize_config NFC_PS_noctr() noexcept
Use NFC normalization, convert all line-feed-like characters to PS.
Definition unicode_normalization.hpp:81
uint64_t decomposition_mask
The types of decompositions, that should be used when decomposing.
Definition unicode_normalization.hpp:27
std::u32string drop
Code-points to be dropped.
Definition unicode_normalization.hpp:47
uint64_t drop_C1
Drop the C1 control characters.
Definition unicode_normalization.hpp:35
std::u32string paragraph_separators
Code-points to be treated as paragraph-separators.
Definition unicode_normalization.hpp:43
uint64_t drop_C0
Drop the C0 control characters.
Definition unicode_normalization.hpp:31
static constexpr unicode_normalize_config NFC_CRLF_noctr() noexcept
Use NFC normalization, convert all line-feed-like characters to CR-LF.
Definition unicode_normalization.hpp:94
std::u32string line_separators
Code-points to be treated as line-separators.
Definition unicode_normalization.hpp:39