7#include "ucd_decompositions.hpp"
8#include "ucd_compositions.hpp"
9#include "ucd_canonical_combining_classes.hpp"
10#include "unicode_description.hpp"
11#include "../algorithm/module.hpp"
12#include "../utility/utility.hpp"
13#include "../macros.hpp"
51 char32_t line_separator_character = unicode_LS;
57 char32_t paragraph_separator_character = unicode_PS;
61 decomposition_mask |= 1_uz << std::to_underlying(type);
68 r.add(unicode_decomposition_type::canonical);
72 [[nodiscard]]
constexpr static unicode_normalize_config NFC() noexcept
85 r.paragraph_separators = U
"\n\v\f\u0085\u2028\u2029";
86 r.paragraph_separator_character = U
'\u2029';
98 r.paragraph_separators = U
"\n\v\f\u0085\u2028\u2029";
99 r.paragraph_separator_character = U
'\r';
105 auto r = unicode_normalize_config::NFD();
106 r.add(unicode_decomposition_type::canonical);
107 r.add(unicode_decomposition_type::font);
108 r.add(unicode_decomposition_type::noBreak);
109 r.add(unicode_decomposition_type::initial);
110 r.add(unicode_decomposition_type::medial);
111 r.add(unicode_decomposition_type::_final);
112 r.add(unicode_decomposition_type::isolated);
113 r.add(unicode_decomposition_type::circle);
114 r.add(unicode_decomposition_type::super);
115 r.add(unicode_decomposition_type::sub);
116 r.add(unicode_decomposition_type::fraction);
117 r.add(unicode_decomposition_type::vertical);
118 r.add(unicode_decomposition_type::wide);
119 r.add(unicode_decomposition_type::narrow);
120 r.add(unicode_decomposition_type::small);
121 r.add(unicode_decomposition_type::square);
122 r.add(unicode_decomposition_type::compat);
126 [[nodiscard]]
constexpr static unicode_normalize_config NFKC() noexcept
134constexpr void unicode_decompose(
char32_t code_point, unicode_normalize_config config,
std::u32string& r)
noexcept
136 for (hilet c : config.line_separators) {
137 if (code_point == c) {
138 r += config.line_separator_character;
139 if (config.line_separator_character == unicode_CR) {
146 for (hilet c : config.paragraph_separators) {
147 if (code_point == c) {
148 r += config.paragraph_separator_character;
149 if (config.paragraph_separator_character == unicode_CR) {
156 for (hilet c : config.drop) {
157 if (code_point == c) {
162 if (config.drop_C0 and ((code_point >= U
'\u0000' and code_point <= U
'\u001f') or code_point == U
'\u007f')) {
166 if (config.drop_C1 and code_point >= U
'\u0080' and code_point <= U
'\u009f') {
171 if (decomposition_info.should_decompose(config.decomposition_mask)) {
172 for (hilet c : decomposition_info.decompose()) {
173 unicode_decompose(c, config, r);
177 hilet ccc = ucd_get_canonical_combining_class(code_point);
178 r += code_point | (wide_cast<char32_t>(ccc) << 24);
182constexpr void unicode_decompose(std::u32string_view text, unicode_normalize_config config,
std::u32string& r)
noexcept
184 for (hilet c : text) {
185 unicode_decompose(c, config, r);
191 if (text.size() <= 1) {
199 while (i != text.size()) {
200 hilet code_unit = text[i++];
201 hilet code_point = code_unit & 0xff'ffff;
202 hilet combining_class = code_unit >> 24;
203 hilet first_is_starter = combining_class == 0;
205 if (code_unit == 0xffff'ffff) {
209 }
else if (first_is_starter) {
211 auto first_code_point = code_point;
212 char32_t previous_combining_class = 0;
213 for (
auto k = i; k != text.size(); ++k) {
214 hilet second_code_unit = text[k];
215 hilet second_code_point = second_code_unit & 0xff'ffff;
216 hilet second_combining_class = second_code_unit >> 24;
218 hilet blocking_pair = previous_combining_class != 0 and previous_combining_class >= second_combining_class;
219 hilet second_is_starter = second_combining_class == 0;
222 if (composed_code_point and not blocking_pair) {
224 first_code_point = *composed_code_point;
226 previous_combining_class = 0;
228 text[k] = 0xffff'ffff;
230 }
else if (second_is_starter) {
236 previous_combining_class = second_combining_class;
240 text[j++] = first_code_point;
244 text[j++] = code_point;
253 constexpr auto ccc_less = [](
char32_t a,
char32_t b) {
254 return (a >> 24) < (b >> 24);
257 hilet first = text.begin();
258 hilet last = text.end();
264 auto cluster_it = first;
265 for (
auto it = cluster_it + 1; it != last; ++it) {
266 if (*it <= 0xff'ffff) {
278 for (
auto& codePoint : text) {
279 codePoint &= 0x1f'ffff;
294 detail::unicode_decompose(text,
config, r);
295 detail::unicode_reorder(r);
296 detail::unicode_clean(r);
309 detail::unicode_decompose(text,
config, r);
310 detail::unicode_reorder(r);
311 detail::unicode_compose(r);
312 detail::unicode_clean(r);
321template<std::input_iterator It, std::sentinel_for<It> ItEnd>
334 if (ucd_get_canonical_combining_class(*
it++) != 0) {
342 for (;
it != last; ++
it) {
343 hilet
ccc = ucd_get_canonical_combining_class(*
it);
DOXYGEN BUG.
Definition algorithm.hpp:16
constexpr bool unicode_is_NFC_grapheme(It it, ItEnd last) noexcept
Check if the string of code-points is a single grapheme in NFC normal form.
Definition unicode_normalization.hpp:322
constexpr std::u32string unicode_normalize(std::u32string_view text, unicode_normalize_config config=unicode_normalize_config::NFC()) noexcept
Convert text to a Unicode composed normal form.
Definition unicode_normalization.hpp:306
constexpr ucd_decomposition_info ucd_get_decomposition(char32_t code_point) noexcept
Get the decomposition info of a code-point.
Definition ucd_decompositions.hpp:4800
constexpr Out narrow_cast(In const &rhs) noexcept
Cast numeric values without loss of precision.
Definition cast.hpp:377
constexpr std::optional< char32_t > ucd_get_composition(char32_t cp1, char32_t cp2) noexcept
Get the composition info of two code-points.
Definition ucd_compositions.hpp:2338
Definition unicode_normalization.hpp:22
static constexpr unicode_normalize_config NFC_PS_noctr() noexcept
Use NFC normalization, convert all line-feed-like characters to PS.
Definition unicode_normalization.hpp:79
uint64_t decomposition_mask
The types of decompositions that should be used when decomposing.
Definition unicode_normalization.hpp:25
std::u32string drop
Code-points to be dropped.
Definition unicode_normalization.hpp:45
uint64_t drop_C1
Drop the C1 control characters.
Definition unicode_normalization.hpp:33
std::u32string paragraph_separators
Code-points to be treated as paragraph-separators.
Definition unicode_normalization.hpp:41
uint64_t drop_C0
Drop the C0 control characters.
Definition unicode_normalization.hpp:29
static constexpr unicode_normalize_config NFC_CRLF_noctr() noexcept
Use NFC normalization, convert all line-feed-like characters to CR-LF.
Definition unicode_normalization.hpp:92
std::u32string line_separators
Code-points to be treated as line-separators.
Definition unicode_normalization.hpp:37