HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
unicode_description.hpp
1// Copyright Take Vos 2020-2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "unicode_general_category.hpp"
8#include "unicode_bidi_class.hpp"
9#include "unicode_bidi_bracket_type.hpp"
10#include "unicode_grapheme_cluster_break.hpp"
11#include "unicode_line_break.hpp"
15#include "unicode_decomposition_type.hpp"
16#include "unicode_script.hpp"
17#include "../required.hpp"
18#include "../assert.hpp"
19#include "../cast.hpp"
20
21namespace hi::inline v1 {
namespace detail {

// Constants for the arithmetic Hangul syllable composition/decomposition
// algorithm (S = precomposed syllable, L/V/T = the three jamo ranges).
//
// They are `inline` so that every translation unit including this header
// shares one definition instead of getting its own internal-linkage copy.

/// First code-point of the precomposed syllable range.
inline constexpr char32_t unicode_hangul_S_base = U'\uac00';
/// First code-point of the L jamo range.
inline constexpr char32_t unicode_hangul_L_base = U'\u1100';
/// First code-point of the V jamo range.
inline constexpr char32_t unicode_hangul_V_base = U'\u1161';
/// Base used to compute the T index; index 0 is reserved for "no T jamo".
inline constexpr char32_t unicode_hangul_T_base = U'\u11a7';
inline constexpr char32_t unicode_hangul_L_count = 19;
inline constexpr char32_t unicode_hangul_V_count = 21;
inline constexpr char32_t unicode_hangul_T_count = 28;
/// Number of syllables sharing one L jamo (V_count * T_count).
inline constexpr char32_t unicode_hangul_N_count = unicode_hangul_V_count * unicode_hangul_T_count;
/// Total number of precomposed syllables (L_count * N_count).
inline constexpr char32_t unicode_hangul_S_count = unicode_hangul_L_count * unicode_hangul_N_count;
} // namespace detail
34
// Frequently used code-points. `inline` keeps a single definition across all
// translation units that include this header.

/// U+FFFD REPLACEMENT CHARACTER.
inline constexpr char32_t unicode_replacement_character = U'\ufffd';
/// U+2028 LINE SEPARATOR.
inline constexpr char32_t unicode_LS = U'\u2028';
/// U+2029 PARAGRAPH SEPARATOR.
inline constexpr char32_t unicode_PS = U'\u2029';
38
39[[nodiscard]] constexpr bool is_hangul_L_part(char32_t code_point) noexcept
40{
41 return code_point >= detail::unicode_hangul_L_base &&
42 code_point < (detail::unicode_hangul_L_base + detail::unicode_hangul_L_count);
43}
44
45[[nodiscard]] constexpr bool is_hangul_V_part(char32_t code_point) noexcept
46{
47 return code_point >= detail::unicode_hangul_V_base &&
48 code_point < (detail::unicode_hangul_V_base + detail::unicode_hangul_V_count);
49}
50
51[[nodiscard]] constexpr bool is_hangul_T_part(char32_t code_point) noexcept
52{
53 return code_point >= detail::unicode_hangul_T_base &&
54 code_point < (detail::unicode_hangul_T_base + detail::unicode_hangul_T_count);
55}
56
57[[nodiscard]] constexpr bool is_hangul_syllable(char32_t code_point) noexcept
58{
59 return code_point >= detail::unicode_hangul_S_base &&
60 code_point < (detail::unicode_hangul_S_base + detail::unicode_hangul_S_count);
61}
62
63[[nodiscard]] constexpr bool is_hangul_LV_part(char32_t code_point) noexcept
64{
65 return is_hangul_syllable(code_point) && ((code_point - detail::unicode_hangul_S_base) % detail::unicode_hangul_T_count) == 0;
66}
67
68[[nodiscard]] constexpr bool is_hangul_LVT_part(char32_t code_point) noexcept
69{
70 return is_hangul_syllable(code_point) && ((code_point - detail::unicode_hangul_S_base) % detail::unicode_hangul_T_count) != 0;
71}
72
80public:
// Special member functions: default-constructible and movable, but not
// copy-assignable.
// NOTE(review): original line 82 is absent from this listing; presumably it
// declares the copy constructor - confirm against the real header.
81 constexpr unicode_description() noexcept = default;
83 unicode_description &operator=(unicode_description const &) = delete;
84 constexpr unicode_description(unicode_description &&) noexcept = default;
85 constexpr unicode_description &operator=(unicode_description &&) noexcept = default;
86
87 [[nodiscard]] constexpr unicode_description(
88 char32_t code_point,
89 unicode_general_category general_category,
90 unicode_grapheme_cluster_break grapheme_cluster_break,
91 unicode_line_break_class line_break_class,
92 unicode_word_break_property word_break_property,
93 unicode_sentence_break_property sentence_break_property,
94 unicode_east_asian_width east_asian_width,
95 unicode_script script,
96 unicode_bidi_class bidi_class,
97 unicode_bidi_bracket_type bidi_bracket_type,
98 char32_t bidi_mirrored_glyph,
99 unicode_decomposition_type decomposition_type,
100 bool is_canonical_composition,
101 uint8_t canonical_combining_class,
102 uint8_t decomposition_length,
103 uint32_t decomposition_index,
104 uint16_t non_starter_code) noexcept :
105 _general_info(
106 (static_cast<uint32_t>(code_point) << code_point_shift) |
107 (static_cast<uint32_t>(general_category) << general_category_shift) |
108 (static_cast<uint32_t>(grapheme_cluster_break) << grapheme_cluster_break_shift) |
109 (static_cast<uint32_t>(is_canonical_composition) << is_canonical_composition_shift) |
110 (static_cast<uint32_t>(canonical_combining_class != 0) << is_combining_mark_shift)),
111 _bidi_class(to_underlying(bidi_class)),
112 _east_asian_width(static_cast<uint32_t>(east_asian_width)),
113 _line_break_class(to_underlying(line_break_class)),
114 _word_break_property(to_underlying(word_break_property)),
115 _sentence_break_property(to_underlying(sentence_break_property)),
116 _decomposition_index(static_cast<uint32_t>(decomposition_index)),
117 _decomposition_type(static_cast<uint32_t>(decomposition_type)),
118 _decomposition_length(static_cast<uint32_t>(decomposition_length))
119 {
120 // Check if the delta fits.
121 if (canonical_combining_class == 0) {
122 _non_mark.bidi_bracket_type = static_cast<uint32_t>(bidi_bracket_type);
123 _non_mark.script = static_cast<uint32_t>(script);
124
125 if (bidi_bracket_type != unicode_bidi_bracket_type::n and bidi_mirrored_glyph != char32_t{0xffff}) {
126 auto mirrored_glyph_delta = static_cast<int32_t>(bidi_mirrored_glyph) - static_cast<int32_t>(code_point);
127 hi_axiom(mirrored_glyph_delta >= bidi_mirrored_glyph_min and mirrored_glyph_delta <= bidi_mirrored_glyph_max);
128 _non_mark.bidi_mirrored_glyph = static_cast<uint32_t>(mirrored_glyph_delta) & bidi_mirrored_glyph_mask;
129
130 } else {
131 _non_mark.bidi_mirrored_glyph = bidi_mirrored_glyph_null;
132 }
133 _non_mark._reserved = 0;
134
135 } else {
136 _mark.canonical_combining_class = canonical_combining_class;
137 _mark.non_starter_code = static_cast<uint32_t>(non_starter_code);
138 _mark._reserved = 0;
139 }
140
141 hi_axiom(code_point <= 0x10ffff);
142 hi_axiom(to_underlying(general_category) <= 0x1f);
143 hi_axiom(to_underlying(grapheme_cluster_break) <= 0x0f);
144 hi_axiom(to_underlying(line_break_class) <= 0x3f);
145 hi_axiom(to_underlying(word_break_property) <= 0x1f);
146 hi_axiom(to_underlying(sentence_break_property) <= 0xf);
147 hi_axiom(to_underlying(east_asian_width) <= 0x7);
148 hi_axiom(to_underlying(script) <= 0xff);
149 hi_axiom(to_underlying(bidi_class) <= 0x1f);
150 hi_axiom(to_underlying(bidi_bracket_type) <= 0x03);
151 hi_axiom(static_cast<uint32_t>(bidi_mirrored_glyph) <= 0x10ffff);
152 hi_axiom(static_cast<uint32_t>(canonical_combining_class) <= 0xff);
153 hi_axiom(to_underlying(decomposition_type) <= 0x7);
154 hi_axiom(static_cast<uint32_t>(decomposition_length) <= 0x1f);
155 hi_axiom(static_cast<uint32_t>(decomposition_index) <= 0x1f'ffff);
156 hi_axiom(static_cast<uint32_t>(non_starter_code) <= 0x3ff);
157 }
158
159 [[nodiscard]] static constexpr unicode_description make_unassigned(unicode_description const &other)
160 {
161 auto r = unicode_description{};
162 r._general_info = other._general_info;
163 r._bidi_class = other._bidi_class;
164 if (other.is_combining_mark()) {
165 r._mark = other._mark;
166 } else {
167 r._non_mark = other._non_mark;
168 }
169 r._east_asian_width = other._east_asian_width;
170 r._line_break_class = other._line_break_class;
171 r._word_break_property = other._word_break_property;
172 r._sentence_break_property = other._sentence_break_property;
173 r._decomposition_index = other._decomposition_index;
174 r._decomposition_type = other._decomposition_type;
175 r._decomposition_length = other._decomposition_length;
176
177 r._general_info &= ~(general_category_mask << general_category_shift);
178 r._general_info |= static_cast<uint32_t>(to_underlying(unicode_general_category::Cn)) << general_category_shift;
179 return r;
180 }
181
185 [[nodiscard]] constexpr char32_t code_point() const noexcept
186 {
187 return static_cast<char32_t>((_general_info >> code_point_shift) & code_point_mask);
188 }
189
196 [[nodiscard]] constexpr unicode_general_category general_category() const noexcept
197 {
198 return static_cast<unicode_general_category>((_general_info >> general_category_shift) & general_category_mask);
199 }
200
207 [[nodiscard]] constexpr unicode_grapheme_cluster_break grapheme_cluster_break() const noexcept
208 {
209 return static_cast<unicode_grapheme_cluster_break>(
210 (_general_info >> grapheme_cluster_break_shift) & grapheme_cluster_break_mask);
211 }
212
213 [[nodiscard]] constexpr bool is_canonical_composition() const noexcept
214 {
215 return static_cast<bool>((_general_info >> is_canonical_composition_shift) & is_canonical_composition_mask);
216 }
217
218 [[nodiscard]] constexpr bool is_combining_mark() const noexcept
219 {
220 return static_cast<bool>((_general_info >> is_combining_mark_shift) & is_combining_mark_mask);
221 }
222
223 [[nodiscard]] constexpr unicode_line_break_class line_break_class() const noexcept
224 {
225 return static_cast<unicode_line_break_class>(_line_break_class);
226 }
227
228 [[nodiscard]] constexpr unicode_word_break_property word_break_property() const noexcept
229 {
230 return static_cast<unicode_word_break_property>(_word_break_property);
231 }
232
233 [[nodiscard]] constexpr unicode_sentence_break_property sentence_break_property() const noexcept
234 {
235 return static_cast<unicode_sentence_break_property>(_sentence_break_property);
236 }
237
238 [[nodiscard]] constexpr unicode_east_asian_width east_asian_width() const noexcept
239 {
240 return static_cast<unicode_east_asian_width>(_east_asian_width);
241 }
242
249 [[nodiscard]] constexpr unicode_bidi_class bidi_class() const noexcept
250 {
251 return static_cast<unicode_bidi_class>(_bidi_class);
252 }
253
256 [[nodiscard]] constexpr unicode_script script() const noexcept
257 {
258 if (is_combining_mark()) {
259 return unicode_script::Common;
260 } else {
261 return static_cast<unicode_script>(_non_mark.script);
262 }
263 }
264
271 [[nodiscard]] constexpr unicode_bidi_bracket_type bidi_bracket_type() const noexcept
272 {
273 if (is_combining_mark()) {
274 return unicode_bidi_bracket_type::n;
275 } else {
276 return static_cast<unicode_bidi_bracket_type>(_non_mark.bidi_bracket_type);
277 }
278 }
279
283 [[nodiscard]] constexpr char32_t bidi_mirrored_glyph() const noexcept
284 {
285 if (bidi_bracket_type() == unicode_bidi_bracket_type::n or _non_mark.bidi_mirrored_glyph == bidi_mirrored_glyph_null) {
286 return 0xffff;
287 }
288
289 constexpr auto sign_extent_shift = 32 - bidi_mirrored_glyph_width;
290
291 hilet mirrored_glyph_delta =
292 static_cast<int32_t>(_non_mark.bidi_mirrored_glyph << sign_extent_shift) >> sign_extent_shift;
293 hilet cp = code_point();
294 hilet mirror_cp = static_cast<char32_t>(cp + mirrored_glyph_delta);
295 return mirror_cp;
296 }
297
301 [[nodiscard]] constexpr unicode_decomposition_type decomposition_type() const noexcept
302 {
303 return static_cast<unicode_decomposition_type>(_decomposition_type);
304 }
305
315 [[nodiscard]] constexpr uint8_t canonical_combining_class() const noexcept
316 {
317 if (is_combining_mark()) {
318 return static_cast<uint8_t>(_mark.canonical_combining_class);
319 } else {
320 return 0;
321 }
322 }
323
335 [[nodiscard]] constexpr std::size_t decomposition_length() const noexcept
336 {
337 return static_cast<std::size_t>(_decomposition_length);
338 }
339
351 [[nodiscard]] constexpr std::size_t decomposition_index() const noexcept
352 {
353 return static_cast<std::size_t>(_decomposition_index);
354 }
355
362 [[nodiscard]] constexpr char32_t canonical_equivalent() const noexcept
363 {
364 if (decomposition_type() == unicode_decomposition_type::canonical and _decomposition_length == 1) {
365 return static_cast<char32_t>(_decomposition_index);
366 } else {
367 return U'\uffff';
368 }
369 }
370
378 [[nodiscard]] constexpr size_t non_starter_code() const noexcept
379 {
380 hi_axiom(is_combining_mark());
381 return static_cast<size_t>(_mark.non_starter_code);
382 }
383
// Look up the description for `code_point` and return a reference to it.
// Only declared here; the definition is not part of this listing.
394 [[nodiscard]] static unicode_description const &find(char32_t code_point) noexcept;
395
396 [[nodiscard]] friend bool operator==(unicode_description const &lhs, unicode_general_category const &rhs) noexcept
397 {
398 return lhs.general_category() == rhs;
399 }
400
401 [[nodiscard]] friend bool operator==(unicode_description const &lhs, unicode_decomposition_type const &rhs) noexcept
402 {
403 return lhs.decomposition_type() == rhs;
404 }
405
406 [[nodiscard]] friend bool operator==(unicode_description const &lhs, unicode_bidi_bracket_type const &rhs) noexcept
407 {
408 return lhs.bidi_bracket_type() == rhs;
409 }
410
411 [[nodiscard]] friend bool operator==(unicode_description const &lhs, unicode_bidi_class const &rhs) noexcept
412 {
413 return lhs.bidi_class() == rhs;
414 }
415 [[nodiscard]] friend bool operator==(unicode_description const &lhs, unicode_east_asian_width const &rhs) noexcept
416 {
417 return lhs.east_asian_width() == rhs;
418 }
419
420 [[nodiscard]] friend bool operator==(unicode_description const &lhs, unicode_sentence_break_property const &rhs) noexcept
421 {
422 return lhs.sentence_break_property() == rhs;
423 }
424
425 [[nodiscard]] friend bool operator==(unicode_description const &lhs, unicode_line_break_class const &rhs) noexcept
426 {
427 return lhs.line_break_class() == rhs;
428 }
429
430 [[nodiscard]] friend bool operator==(unicode_description const &lhs, unicode_word_break_property const &rhs) noexcept
431 {
432 return lhs.word_break_property() == rhs;
433 }
434
435 [[nodiscard]] friend bool operator==(unicode_description const &lhs, unicode_grapheme_cluster_break const &rhs) noexcept
436 {
437 return lhs.grapheme_cluster_break() == rhs;
438 }
439
440 [[nodiscard]] friend bool operator==(unicode_description const &lhs, char32_t const &rhs) noexcept
441 {
442 return lhs.code_point() == rhs;
443 }
444
445 [[nodiscard]] friend bool is_C(unicode_description const &rhs) noexcept
446 {
447 return is_C(rhs.general_category());
448 }
449
450private:
// Shift/mask pairs for the fields packed into `_general_info`. A field is
// read as `(_general_info >> shift) & mask`.
451 static constexpr uint32_t code_point_shift = 11;
452 static constexpr uint32_t code_point_mask = 0x1f'ffff;
453 static constexpr uint32_t general_category_shift = 6;
454 static constexpr uint32_t general_category_mask = 0x1f;
455 static constexpr uint32_t grapheme_cluster_break_shift = 2;
456 static constexpr uint32_t grapheme_cluster_break_mask = 0xf;
457 static constexpr uint32_t is_canonical_composition_shift = 1;
458 static constexpr uint32_t is_canonical_composition_mask = 0x1;
459 static constexpr uint32_t is_combining_mark_shift = 0;
460 static constexpr uint32_t is_combining_mark_mask = 0x1;
461
// Encoding of the mirrored glyph as a signed delta relative to the code-point.
// With a 13-bit mask the delta range is -4095..4095; the one remaining bit
// pattern (0x1000, i.e. min - 1) is reserved to mean "no mirrored glyph".
462 static constexpr uint32_t bidi_mirrored_glyph_mask = 0x1fff;
463 static constexpr uint32_t bidi_mirrored_glyph_width = std::bit_width(bidi_mirrored_glyph_mask);
464 static constexpr int32_t bidi_mirrored_glyph_max = static_cast<int32_t>(bidi_mirrored_glyph_mask >> 1);
465 static constexpr int32_t bidi_mirrored_glyph_min = -bidi_mirrored_glyph_max;
466 static constexpr uint32_t bidi_mirrored_glyph_null =
467 static_cast<uint32_t>(bidi_mirrored_glyph_min - 1) & bidi_mirrored_glyph_mask;
468
469 // 1st dword
470 // We don't use bit-fields so we can do binary-search without needing shift- & and-operations
471 // code_point must be in msb for correct binary search.
472 // [31:11] code-point
473 // [10:6] general category
474 // [5:2] grapheme cluster break
475 // [1] is_canonical_composition
476 // [0] is_combining_mark
// The bit ranges above correspond to the *_shift / *_mask constants.
477 uint32_t _general_info;
478
479 // 2nd dword
// Bit-field widths add up to 32 (5 + 5 + 6 + 4 + 12); the constructor's
// hi_axiom() checks use the matching upper bounds.
480 uint32_t _bidi_class : 5;
481 uint32_t _word_break_property : 5;
482 uint32_t _line_break_class : 6;
483 uint32_t _sentence_break_property : 4;
484 uint32_t _word2_reserved : 12 = 0;
485
// Layout of the 3rd dword when the code-point is a combining mark
// (is_combining_mark() is true). Widths add up to 32 (8 + 10 + 14).
486 struct mark_type {
487 uint32_t canonical_combining_class : 8;
488 uint32_t non_starter_code : 10;
489 uint32_t _reserved : 14;
490 };
491
// Layout of the 3rd dword when the code-point is not a combining mark.
// Widths add up to 32 (13 + 2 + 8 + 9).
492 struct non_mark_type {
// Signed delta to the mirrored glyph, or bidi_mirrored_glyph_null.
493 uint32_t bidi_mirrored_glyph : 13;
494 uint32_t bidi_bracket_type : 2;
495 uint32_t script : 8;
496 uint32_t _reserved : 9;
497 };
498
499 // 3rd dword
// Which struct member is in use is selected by the is_combining_mark bit of
// `_general_info`; `_word3` zero-initialises the word by default.
500 union {
501 mark_type _mark;
502 non_mark_type _non_mark;
503 uint32_t _word3 = 0;
504 };
505
506 // 4th dword
// Bit-field widths add up to 32 (21 + 3 + 5 + 3). Constructor mem-initialisers
// run in the order these members are declared.
507 uint32_t _decomposition_index : 21;
508 uint32_t _decomposition_type : 3;
509 uint32_t _decomposition_length : 5;
510 uint32_t _east_asian_width : 3;
511};
512
513static_assert(sizeof(unicode_description) == 16);
514
515} // namespace hi::inline v1
This file includes required definitions.
#define hilet
Invariant should be the default for variables.
Definition required.hpp:23
Description of a unicode code point.
Definition unicode_description.hpp:79
constexpr size_t non_starter_code() const noexcept
Get the non-starter-code.
Definition unicode_description.hpp:378
constexpr unicode_grapheme_cluster_break grapheme_cluster_break() const noexcept
The grapheme cluster break of this code-point.
Definition unicode_description.hpp:207
constexpr char32_t bidi_mirrored_glyph() const noexcept
Get the mirrored glyph.
Definition unicode_description.hpp:283
constexpr unicode_bidi_class bidi_class() const noexcept
The bidi class of this code-point. This function is used by the bidirectional algorithm to figure out ...
Definition unicode_description.hpp:249
constexpr unicode_bidi_bracket_type bidi_bracket_type() const noexcept
Get the bidi bracket type.
Definition unicode_description.hpp:271
constexpr char32_t code_point() const noexcept
The code point of the description.
Definition unicode_description.hpp:185
constexpr uint8_t canonical_combining_class() const noexcept
Get the combining class.
Definition unicode_description.hpp:315
constexpr std::size_t decomposition_length() const noexcept
The number of code-points the decomposed grapheme has.
Definition unicode_description.hpp:335
constexpr unicode_general_category general_category() const noexcept
The general category of this code-point.
Definition unicode_description.hpp:196
static unicode_description const & find(char32_t code_point) noexcept
Find a code-point in the global unicode_description table.
constexpr char32_t canonical_equivalent() const noexcept
Get the canonical equivalent of this code-point.
Definition unicode_description.hpp:362
constexpr unicode_script script() const noexcept
Get the script of this character.
Definition unicode_description.hpp:256
constexpr unicode_decomposition_type decomposition_type() const noexcept
This character has a canonical decomposition.
Definition unicode_description.hpp:301
constexpr std::size_t decomposition_index() const noexcept
A multi-use value representing the decomposition of this code-point.
Definition unicode_description.hpp:351