HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
grapheme.hpp
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "../utility/utility.hpp"
8#include "../i18n/i18n.hpp"
9#include "../telemetry/telemetry.hpp"
10#include "../concurrency/concurrency.hpp"
11#include "unicode_normalization.hpp"
12#include "ucd_general_categories.hpp"
13#include "ucd_canonical_combining_classes.hpp"
14#include "ucd_scripts.hpp"
15#include "phrasing.hpp"
16#include "../macros.hpp"
17#include <cstdint>
18#include <string>
19#include <string_view>
20#include <cstddef>
21#include <memory>
22#include <vector>
23#include <algorithm>
24#include <bit>
25#include <array>
26#include <atomic>
27
28namespace hi::inline v1 {
29namespace detail {
30
32public:
33 long_grapheme_table() = default;
36 long_grapheme_table& operator=(long_grapheme_table const&) = delete;
37 long_grapheme_table& operator=(long_grapheme_table&&) = delete;
38
47 [[nodiscard]] std::u32string get_grapheme(uint32_t start) const noexcept
48 {
49 // If `start` came from another thread it will have been transferred
50 // to this thread pr
51 auto src = std::addressof(_table[start]);
52 hilet length = *src >> 21;
53
54 auto r = std::u32string{};
55 r.resize_and_overwrite(length, [&](char32_t *dst, size_t count) {
56 std::copy_n(src, count, dst);
57 *dst &= 0x1f'ffff;
58 return count;
59 });
60 return r;
61 }
62
71 [[nodiscard]] size_t get_grapheme_size(uint32_t start) const noexcept
72 {
73 return _table[start] >> 21;
74 }
75
84 [[nodiscard]] char32_t get_grapheme_starter(uint32_t start) const noexcept
85 {
86 return char32_t{_table[start] & 0x1f'ffff};
87 }
88
96 template<typename CodePoints>
97 [[nodiscard]] int32_t add_grapheme(CodePoints&& code_points) noexcept
98 {
99 static_assert(std::is_same_v<typename std::remove_cvref_t<CodePoints>::value_type, char32_t>);
100
101 hi_axiom(code_points.size() >= 2);
102 hi_axiom(unicode_is_NFC_grapheme(code_points.cbegin(), code_points.cend()));
103
104 hilet lock = std::scoped_lock(_mutex);
105
106 // See if this grapheme already exists and return its index.
107 if (hilet it = _indices.find(code_points); it != _indices.end()) {
108 return it->second;
109 }
110
111 // Check if there is enough room in the table to add the code-points.
112 if (_head + code_points.size() >= _table.size()) {
113 return -1;
114 }
115
116 hilet insert_index = _head;
117 _head += narrow_cast<uint32_t>(code_points.size());
118
119 // Copy the grapheme into the table, and set the size on the first entry.
120 std::copy(code_points.cbegin(), code_points.cend(), _table.begin() + insert_index);
121 _table[insert_index] |= char_cast<char32_t>(code_points.size() << 21);
122
123 // Add the grapheme to the quickly searchable index table.
124 _indices.emplace(std::forward<CodePoints>(code_points), insert_index);
125
126 return insert_index;
127 }
128
129private:
130 mutable unfair_mutex _mutex = {};
131 uint32_t _head = {};
132
138 std::array<char32_t, 0x0f'0000> _table = {};
139
141};
142
/** The table instance used by `grapheme`.
 *
 * `grapheme` adds graphemes of more than one code-point to this table and
 * stores the returned index, offset by 0x11'0000, in its own value.
 */
143inline long_grapheme_table long_graphemes = {};
144
145} // namespace detail
146
/** Tag type that selects the `grapheme` constructor taking a container of
 * code-points which is used as-is, without being normalized first.
 */
147struct composed_t {};
148
160struct grapheme {
161 using value_type = uint64_t;
162
175 value_type _value;
176
177 constexpr grapheme() noexcept = default;
178 constexpr grapheme(grapheme const&) noexcept = default;
179 constexpr grapheme(grapheme&&) noexcept = default;
180 constexpr grapheme& operator=(grapheme const&) noexcept = default;
181 constexpr grapheme& operator=(grapheme&&) noexcept = default;
182
183 constexpr grapheme(intrinsic_t, value_type value) : _value(value) {}
184
185 constexpr value_type& intrinsic() noexcept
186 {
187 return _value;
188 }
189
190 constexpr value_type const& intrinsic() const noexcept
191 {
192 return _value;
193 }
194
197 constexpr grapheme(char32_t code_point) noexcept : _value(char_cast<value_type>(code_point))
198 {
199 hi_axiom(code_point <= 0x10'ffff);
200 hi_axiom(ucd_get_canonical_combining_class(code_point) == 0, "Single code-point must be a starter");
201 set_script();
202 }
203
204 constexpr grapheme(char ascii_char) noexcept : _value(char_cast<value_type>(ascii_char))
205 {
206 hi_axiom(ascii_char >= 0 and ascii_char <= 0x7f);
207 // All ASCII characters are starters.
208 set_script();
209 }
210
213 constexpr grapheme& operator=(char32_t code_point) noexcept
214 {
215 hi_axiom(code_point <= 0x10'ffff);
216 hi_axiom(ucd_get_canonical_combining_class(code_point) == 0, "Single code-point must be a starter");
217
218 _value = char_cast<value_type>(code_point);
219 set_script();
220 return *this;
221 }
222
225 constexpr grapheme& operator=(char ascii_char) noexcept
226 {
227 hi_axiom(ascii_char >= 0 and ascii_char <= 0x7f);
228 // All ASCII characters are starters.
229
230 _value = char_cast<value_type>(ascii_char);
231 set_script();
232 return *this;
233 }
234
239 template<typename CodePoints>
240 constexpr grapheme(composed_t, CodePoints&& code_points) noexcept
241 {
242 static_assert(std::is_same_v<typename std::remove_cvref_t<CodePoints>::value_type, char32_t>);
243
244 hi_axiom(not code_points.empty());
245 if (code_points.size() == 1) {
246 hilet code_point = code_points.front();
247 hi_axiom(code_point <= 0x10'ffff);
248 hi_axiom(ucd_get_canonical_combining_class(code_point) == 0);
249 _value = char_cast<value_type>(code_point);
250
251 } else {
252 hilet index = detail::long_graphemes.add_grapheme(std::forward<CodePoints>(code_points));
253 if (index >= 0) {
254 _value = narrow_cast<value_type>(index + 0x11'0000);
255
256 } else {
257 [[unlikely]] hi_log_error_once(
258 "grapheme::error::too-many", "Too many long graphemes encoded, replacing with U+fffd");
259 _value = char_cast<value_type>(U'\ufffd');
260 }
261 }
262 set_script();
263 }
264
269 constexpr explicit grapheme(std::u32string_view code_points) noexcept :
270 grapheme(composed_t{}, unicode_normalize(code_points, unicode_normalize_config::NFC()))
271 {
272 }
273
276 [[nodiscard]] constexpr uint32_t index() const noexcept
277 {
278 return _value & 0x1f'ffff;
279 }
280
281 [[nodiscard]] constexpr iso_639 language() const noexcept
282 {
283 return iso_639{intrinsic_t{}, narrow_cast<uint16_t>((_value >> 21) & 0x7fff)};
284 }
285
286 constexpr void set_language(iso_639 rhs) noexcept
287 {
288 hi_axiom(rhs.intrinsic() <= 0x7fff);
289
290 constexpr auto mask = ~(value_type{0x7fff} << 21);
291 _value &= mask;
292 _value |= wide_cast<value_type>(rhs.intrinsic()) << 21;
293 }
294
300 [[nodiscard]] constexpr iso_15924 starter_script() const noexcept
301 {
302 return ucd_get_script(starter());
303 }
304
312 [[nodiscard]] constexpr iso_15924 starter_script(iso_15924 default_script) const noexcept
313 {
314 hilet starter_script_ = starter_script();
315 if (starter_script_ == iso_15924::common() and starter_script_ == iso_15924::inherited()) {
316 return default_script;
317 } else {
318 return starter_script_;
319 }
320 }
321
322 [[nodiscard]] constexpr iso_15924 script() const noexcept
323 {
324 return iso_15924{intrinsic_t{}, narrow_cast<uint16_t>((_value >> 36) & 0x3ff)};
325 }
326
332 constexpr void set_script(iso_15924 rhs) noexcept
333 {
334 hilet new_script = starter_script(rhs);
335 hi_axiom(new_script.intrinsic() < 1000);
336
337 constexpr auto mask = ~(value_type{0x3ff} << 36);
338 _value &= mask;
339 _value |= wide_cast<value_type>(new_script.intrinsic()) << 36;
340 }
341
344 constexpr void set_script() noexcept
345 {
346 set_script(starter_script());
347 }
348
349 [[nodiscard]] constexpr iso_3166 region() const noexcept
350 {
351 return iso_3166{intrinsic_t{}, narrow_cast<uint16_t>((_value >> 46) & 0x3ff)};
352 }
353
354 constexpr void set_region(iso_3166 rhs) noexcept
355 {
356 hi_axiom(rhs.intrinsic() < 1000);
357
358 constexpr auto mask = ~(value_type{0x3ff} << 46);
359 _value &= mask;
360 _value |= wide_cast<value_type>(rhs.intrinsic()) << 46;
361 }
362
363 [[nodiscard]] constexpr hi::language_tag language_tag() const noexcept
364 {
365 auto tmp = _value;
366 tmp >>= 21;
367 hilet language_ = iso_639{intrinsic_t{}, narrow_cast<uint16_t>(tmp & 0x7fff)};
368 tmp >>= 15;
369 hilet script_ = iso_15924{intrinsic_t{}, narrow_cast<uint16_t>(tmp & 0x3ff)};
370 tmp >>= 10;
371 hilet region_ = iso_3166{intrinsic_t{}, narrow_cast<uint16_t>(tmp & 0x3ff)};
372 return hi::language_tag{language_, script_, region_};
373 }
374
375 constexpr void set_language_tag(hi::language_tag rhs) noexcept
376 {
377 hi_axiom(rhs.region.intrinsic() < 1000);
378 hi_axiom(rhs.language.intrinsic() <= 0x7fff);
379
380 hilet new_script = starter_script(rhs.script);
381 hi_axiom(new_script.intrinsic() < 1000);
382
383 auto tmp = wide_cast<value_type>(rhs.region.intrinsic());
384 tmp <<= 10;
385 tmp |= new_script.intrinsic();
386 tmp <<= 15;
387 tmp |= rhs.language.intrinsic();
388 tmp <<= 21;
389
390 constexpr auto mask = ~(uint64_t{0x7'ffff'ffff} << 21);
391 _value &= mask;
392 _value |= tmp;
393 }
394
395 [[nodiscard]] constexpr hi::phrasing phrasing() const noexcept
396 {
397 return static_cast<hi::phrasing>((_value >> 56) & 0x3f);
398 }
399
400 constexpr void set_phrasing(hi::phrasing rhs) noexcept
401 {
402 hi_axiom(std::to_underlying(rhs) <= 0x3f);
403
404 constexpr auto mask = ~(value_type{0x3f} << 56);
405 _value &= mask;
406 _value |= static_cast<value_type>(rhs) << 56;
407 }
408
411 [[nodiscard]] constexpr std::size_t size() const noexcept
412 {
413 if (auto i = index(); i <= 0x10'ffff) {
414 return 1_uz;
415 } else {
416 return detail::long_graphemes.get_grapheme_size(i - 0x11'0000);
417 }
418 }
419
420 [[nodiscard]] constexpr char32_t starter() const noexcept
421 {
422 if (auto i = index(); i <= 0x10'ffff) {
423 return char_cast<char32_t>(i);
424 } else {
425 return detail::long_graphemes.get_grapheme_starter(i - 0x11'0000);
426 }
427 }
428
429 [[nodiscard]] constexpr bool is_ascii() const noexcept
430 {
431 return index() <= 127;
432 }
433
436 [[nodiscard]] constexpr std::u32string composed() const noexcept
437 {
438 if (hilet i = index(); i <= 0x10'ffff) {
439 return std::u32string{char_cast<char32_t>(i)};
440 } else {
441 return detail::long_graphemes.get_grapheme(i - 0x11'0000);
442 }
443 }
444
447 [[nodiscard]] constexpr std::u32string
448 decomposed(unicode_normalize_config config = unicode_normalize_config::NFD()) const noexcept
449 {
450 return unicode_decompose(composed(), config);
451 }
452
457 [[nodiscard]] friend constexpr bool operator==(grapheme const& lhs, grapheme const& rhs) noexcept
458 {
459 return lhs.index() == rhs.index();
460 }
461
462 [[nodiscard]] friend constexpr bool operator==(grapheme const& lhs, char32_t const& rhs) noexcept
463 {
464 hi_axiom(char_cast<value_type>(rhs) <= 0x10'ffff);
465 return lhs.index() == char_cast<value_type>(rhs);
466 }
467
468 [[nodiscard]] friend constexpr bool operator==(grapheme const& lhs, char const& rhs) noexcept
469 {
470 hi_axiom(char_cast<value_type>(rhs) <= 0x7f);
471 return lhs.index() == char_cast<value_type>(rhs);
472 }
473
476 [[nodiscard]] friend constexpr std::strong_ordering operator<=>(grapheme const& lhs, grapheme const& rhs) noexcept
477 {
478 return lhs.decomposed() <=> rhs.decomposed();
479 }
480
481 [[nodiscard]] friend constexpr std::strong_ordering operator<=>(grapheme const& lhs, char32_t const& rhs) noexcept
482 {
483 return lhs <=> grapheme{rhs};
484 }
485
486 [[nodiscard]] friend constexpr std::strong_ordering operator<=>(grapheme const& lhs, char const& rhs) noexcept
487 {
488 return lhs <=> grapheme{rhs};
489 }
490
491 [[nodiscard]] friend constexpr std::string to_string(grapheme const& rhs) noexcept
492 {
493 return hi::to_string(rhs.composed());
494 }
495
496 [[nodiscard]] friend constexpr std::wstring to_wstring(grapheme const& rhs) noexcept
497 {
498 return hi::to_wstring(rhs.composed());
499 }
500
501 [[nodiscard]] friend constexpr std::u32string to_u32string(grapheme const& rhs) noexcept
502 {
503 return rhs.composed();
504 }
505};
506
507} // namespace hi::inline v1
508
509template<>
510struct std::hash<hi::grapheme> {
511 [[nodiscard]] std::size_t operator()(hi::grapheme const& rhs) const noexcept
512 {
513 return std::hash<hi::grapheme::value_type>{}(rhs._value);
514 }
515};
constexpr std::u32string to_u32string(std::u32string_view rhs) noexcept
Identity conversion from UTF-32 to UTF-32.
Definition to_string.hpp:24
constexpr std::wstring to_wstring(std::u32string_view rhs) noexcept
Conversion from UTF-32 to wide-string (UTF-16/32).
Definition to_string.hpp:156
@ grapheme
The gui_event has grapheme data.
phrasing
Phrasing.
Definition phrasing.hpp:30
DOXYGEN BUG.
Definition algorithm.hpp:16
constexpr bool unicode_is_NFC_grapheme(It it, ItEnd last) noexcept
Check if the string of code-points is a single grapheme in NFC normal form.
Definition unicode_normalization.hpp:322
constexpr std::u32string unicode_normalize(std::u32string_view text, unicode_normalize_config config=unicode_normalize_config::NFC()) noexcept
Convert text to a Unicode composed normal form.
Definition unicode_normalization.hpp:306
hi_export constexpr void set_language(It first, ItEnd last, language_tag language) noexcept
Set the language for the string.
Definition gstring.hpp:178
geometry/margins.hpp
Definition lookahead_iterator.hpp:5
constexpr Out narrow_cast(In const &rhs) noexcept
Cast numeric values without loss of precision.
Definition cast.hpp:377
ISO-3166 country code.
Definition iso_3166_intf.hpp:17
ISO-639 language code.
Definition iso_639.hpp:25
The IETF BCP 47 language tag.
Definition language_tag_intf.hpp:28
Definition grapheme.hpp:31
size_t get_grapheme_size(uint32_t start) const noexcept
Get the size of the grapheme.
Definition grapheme.hpp:71
char32_t get_grapheme_starter(uint32_t start) const noexcept
Get the starter (first) code-point of a grapheme.
Definition grapheme.hpp:84
std::u32string get_grapheme(uint32_t start) const noexcept
Get the grapheme from the table.
Definition grapheme.hpp:47
int32_t add_grapheme(CodePoints &&code_points) noexcept
Find or insert a grapheme in the table.
Definition grapheme.hpp:97
Definition grapheme.hpp:147
A grapheme-cluster, what a user thinks a character is.
Definition grapheme.hpp:160
constexpr grapheme(std::u32string_view code_points) noexcept
Encode a grapheme from a list of code-points.
Definition grapheme.hpp:269
friend constexpr bool operator==(grapheme const &lhs, grapheme const &rhs) noexcept
Compare equivalence of two graphemes.
Definition grapheme.hpp:457
constexpr void set_script() noexcept
Set the script of the grapheme to the starter script.
Definition grapheme.hpp:344
friend constexpr std::strong_ordering operator<=>(grapheme const &lhs, grapheme const &rhs) noexcept
Compare two graphemes lexicographically.
Definition grapheme.hpp:476
constexpr iso_15924 starter_script(iso_15924 default_script) const noexcept
Get the script of the starter code-point.
Definition grapheme.hpp:312
constexpr uint32_t index() const noexcept
Get the codepoint/index part of the grapheme.
Definition grapheme.hpp:276
constexpr grapheme(char32_t code_point) noexcept
Encode a single code-point.
Definition grapheme.hpp:197
constexpr grapheme & operator=(char ascii_char) noexcept
Encode a single code-point.
Definition grapheme.hpp:225
constexpr grapheme(composed_t, CodePoints &&code_points) noexcept
Encode a grapheme from a list of code-points.
Definition grapheme.hpp:240
value_type _value
The grapheme's value.
Definition grapheme.hpp:175
constexpr std::u32string decomposed(unicode_normalize_config config=unicode_normalize_config::NFD()) const noexcept
Get a list of code-point normalized to NFD.
Definition grapheme.hpp:448
constexpr iso_15924 starter_script() const noexcept
Get the script of the starter code-point.
Definition grapheme.hpp:300
constexpr std::u32string composed() const noexcept
Get a list of code-point normalized to NFC.
Definition grapheme.hpp:436
constexpr void set_script(iso_15924 rhs) noexcept
Set the script of the grapheme.
Definition grapheme.hpp:332
constexpr grapheme & operator=(char32_t code_point) noexcept
Encode a single code-point.
Definition grapheme.hpp:213
constexpr std::size_t size() const noexcept
Return the number of code-points encoded in the grapheme.
Definition grapheme.hpp:411
Definition unicode_normalization.hpp:22
T addressof(T... args)
T copy(T... args)
T copy_n(T... args)
T operator()(T... args)
T to_string(T... args)
T to_wstring(T... args)