7#include "ucd_decompositions.hpp"
8#include "ucd_compositions.hpp"
9#include "ucd_canonical_combining_classes.hpp"
10#include "unicode_description.hpp"
11#include "../utility/module.hpp"
12#include "../algorithm.hpp"
17namespace hi::inline
v1 {
48 char32_t line_separator_character = unicode_LS;
54 char32_t paragraph_separator_character = unicode_PS;
58 decomposition_mask |= 1_uz << to_underlying(type);
65 r.add(unicode_decomposition_type::canonical);
69 [[nodiscard]]
constexpr static unicode_normalize_config NFC() noexcept
82 r.paragraph_separators = U
"\n\v\f\u0085\u2028\u2029";
83 r.paragraph_separator_character = U
'\u2029';
95 r.paragraph_separators = U
"\n\v\f\u0085\u2028\u2029";
96 r.paragraph_separator_character = U
'\r';
102 auto r = unicode_normalize_config::NFD();
103 r.add(unicode_decomposition_type::canonical);
104 r.add(unicode_decomposition_type::font);
105 r.add(unicode_decomposition_type::noBreak);
106 r.add(unicode_decomposition_type::initial);
107 r.add(unicode_decomposition_type::medial);
108 r.add(unicode_decomposition_type::_final);
109 r.add(unicode_decomposition_type::isolated);
110 r.add(unicode_decomposition_type::circle);
111 r.add(unicode_decomposition_type::super);
112 r.add(unicode_decomposition_type::sub);
113 r.add(unicode_decomposition_type::fraction);
114 r.add(unicode_decomposition_type::vertical);
115 r.add(unicode_decomposition_type::wide);
116 r.add(unicode_decomposition_type::narrow);
117 r.add(unicode_decomposition_type::small);
118 r.add(unicode_decomposition_type::square);
119 r.add(unicode_decomposition_type::compat);
123 [[nodiscard]]
constexpr static unicode_normalize_config NFKC() noexcept
131constexpr void unicode_decompose(
char32_t code_point, unicode_normalize_config config,
std::u32string& r)
noexcept
133 for (
hilet c : config.line_separators) {
134 if (code_point == c) {
135 r += config.line_separator_character;
136 if (config.line_separator_character == unicode_CR) {
143 for (
hilet c : config.paragraph_separators) {
144 if (code_point == c) {
145 r += config.paragraph_separator_character;
146 if (config.paragraph_separator_character == unicode_CR) {
153 for (
hilet c : config.drop) {
154 if (code_point == c) {
159 if (config.drop_C0 and ((code_point >= U
'\u0000' and code_point <= U
'\u001f') or code_point == U
'\u007f')) {
163 if (config.drop_C1 and code_point >= U
'\u0080' and code_point <= U
'\u009f') {
168 if (decomposition_info.should_decompose(config.decomposition_mask)) {
169 for (
hilet c : decomposition_info.decompose()) {
170 unicode_decompose(c, config, r);
174 hilet ccc = ucd_get_canonical_combining_class(code_point);
175 r += code_point | (wide_cast<char32_t>(ccc) << 24);
179constexpr void unicode_decompose(std::u32string_view text, unicode_normalize_config config,
std::u32string& r)
noexcept
181 for (
hilet c : text) {
182 unicode_decompose(c, config, r);
188 if (text.
size() <= 1) {
196 while (i != text.
size()) {
197 hilet code_unit = text[i++];
198 hilet code_point = code_unit & 0xff'ffff;
199 hilet combining_class = code_unit >> 24;
200 hilet first_is_starter = combining_class == 0;
202 if (code_unit == 0xffff'ffff) {
206 }
else if (first_is_starter) {
208 auto first_code_point = code_point;
209 char32_t previous_combining_class = 0;
210 for (
auto k = i; k != text.
size(); ++k) {
211 hilet second_code_unit = text[k];
212 hilet second_code_point = second_code_unit & 0xff'ffff;
213 hilet second_combining_class = second_code_unit >> 24;
215 hilet blocking_pair = previous_combining_class != 0 and previous_combining_class >= second_combining_class;
216 hilet second_is_starter = second_combining_class == 0;
219 if (composed_code_point and not blocking_pair) {
221 first_code_point = *composed_code_point;
223 previous_combining_class = 0;
225 text[k] = 0xffff'ffff;
227 }
else if (second_is_starter) {
233 previous_combining_class = second_combining_class;
237 text[j++] = first_code_point;
241 text[j++] = code_point;
250 constexpr auto ccc_less = [](
char32_t a,
char32_t b) {
251 return (a >> 24) < (b >> 24);
261 auto cluster_it = first;
262 for (
auto it = cluster_it + 1; it != last; ++it) {
263 if (*it <= 0xff'ffff) {
275 for (
auto& codePoint : text) {
276 codePoint &= 0x1f'ffff;
291 detail::unicode_decompose(text, config, r);
292 detail::unicode_reorder(r);
293 detail::unicode_clean(r);
306 detail::unicode_decompose(text, config, r);
307 detail::unicode_reorder(r);
308 detail::unicode_compose(r);
309 detail::unicode_clean(r);
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
DOXYGEN BUG.
Definition algorithm.hpp:13
constexpr std::u32string unicode_normalize(std::u32string_view text, unicode_normalize_config config=unicode_normalize_config::NFC()) noexcept
Convert text to a Unicode composed normal form.
Definition unicode_normalization.hpp:303
constexpr ucd_decomposition_info ucd_get_decomposition(char32_t code_point) noexcept
Get the decomposition info of a code-point.
Definition ucd_decompositions.hpp:4798
constexpr std::optional< char32_t > ucd_get_composition(char32_t cp1, char32_t cp2) noexcept
Get the composition info of two code-points.
Definition ucd_compositions.hpp:2336
Definition unicode_normalization.hpp:19
static constexpr unicode_normalize_config NFC_PS_noctr() noexcept
Use NFC normalization, convert all line-feed-like characters to PS.
Definition unicode_normalization.hpp:76
uint64_t decomposition_mask
The decomposition types that should be used when decomposing.
Definition unicode_normalization.hpp:22
std::u32string drop
Code-points to be dropped.
Definition unicode_normalization.hpp:42
uint64_t drop_C1
Drop the C1 control characters.
Definition unicode_normalization.hpp:30
std::u32string paragraph_separators
Code-points to be treated as paragraph-separators.
Definition unicode_normalization.hpp:38
uint64_t drop_C0
Drop the C0 control characters.
Definition unicode_normalization.hpp:26
static constexpr unicode_normalize_config NFC_CRLF_noctr() noexcept
Use NFC normalization, convert all line-feed-like characters to CR-LF.
Definition unicode_normalization.hpp:89
std::u32string line_separators
Code-points to be treated as line-separators.
Definition unicode_normalization.hpp:34