HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
grapheme.hpp
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "../utility/utility.hpp"
8#include "../i18n/i18n.hpp"
9#include "../telemetry/telemetry.hpp"
10#include "../concurrency/concurrency.hpp"
11#include "../char_maps/char_maps.hpp"
12#include "grapheme_attributes.hpp"
13#include "unicode_normalization.hpp"
14#include "ucd_general_categories.hpp"
15#include "ucd_canonical_combining_classes.hpp"
16#include "ucd_scripts.hpp"
17#include "phrasing.hpp"
18#include "../macros.hpp"
19#include <cstdint>
20#include <string>
21#include <string_view>
22#include <cstddef>
23#include <memory>
24#include <vector>
25#include <algorithm>
26#include <bit>
27#include <array>
28#include <atomic>
29#include <unordered_map>
30#include <mutex>
31#include <chrono>
32#include <format>
33
34hi_export_module(hikogui.unicode : grapheme);
35
36hi_export namespace hi::inline v1 {
37namespace detail {
38
39class long_grapheme_table {
40public:
41 long_grapheme_table() = default;
42 long_grapheme_table(long_grapheme_table const&) = delete;
43 long_grapheme_table(long_grapheme_table&&) = delete;
44 long_grapheme_table& operator=(long_grapheme_table const&) = delete;
45 long_grapheme_table& operator=(long_grapheme_table&&) = delete;
46
55 [[nodiscard]] std::u32string get_grapheme(uint32_t start) const noexcept
56 {
57 // If `start` came from another thread it will have been transferred
58 // to this thread pr
59 auto src = std::addressof(_table[start]);
60 auto const length = *src >> 21;
61
62 auto r = std::u32string{};
63 r.resize_and_overwrite(length, [&](char32_t *dst, size_t count) {
64 std::copy_n(src, count, dst);
65 *dst &= 0x1f'ffff;
66 return count;
67 });
68 return r;
69 }
70
79 [[nodiscard]] size_t get_grapheme_size(uint32_t start) const noexcept
80 {
81 return _table[start] >> 21;
82 }
83
92 [[nodiscard]] char32_t get_grapheme_starter(uint32_t start) const noexcept
93 {
94 return char32_t{_table[start] & 0x1f'ffff};
95 }
96
104 template<typename CodePoints>
105 [[nodiscard]] int32_t add_grapheme(CodePoints&& code_points) noexcept
106 {
107 static_assert(std::is_same_v<typename std::remove_cvref_t<CodePoints>::value_type, char32_t>);
108
109 hi_axiom(code_points.size() >= 2);
110 hi_axiom(unicode_is_NFC_grapheme(code_points.cbegin(), code_points.cend()));
111
112 auto const lock = std::scoped_lock(_mutex);
113
114 // See if this grapheme already exists and return its index.
115 if (auto const it = _indices.find(code_points); it != _indices.end()) {
116 return it->second;
117 }
118
119 // Check if there is enough room in the table to add the code-points.
120 if (_head + code_points.size() >= _table.size()) {
121 return -1;
122 }
123
124 auto const insert_index = _head;
125 _head += narrow_cast<uint32_t>(code_points.size());
126
127 // Copy the grapheme into the table, and set the size on the first entry.
128 std::copy(code_points.cbegin(), code_points.cend(), _table.begin() + insert_index);
129 _table[insert_index] |= char_cast<char32_t>(code_points.size() << 21);
130
131 // Add the grapheme to the quickly searchable index table.
132 _indices.emplace(std::forward<CodePoints>(code_points), insert_index);
133
134 return insert_index;
135 }
136
137private:
138 mutable unfair_mutex _mutex = {};
139 uint32_t _head = {};
140
146 std::array<char32_t, 0x0f'0000> _table = {};
147
148 std::unordered_map<std::u32string, uint32_t> _indices = {};
149};
150
151inline long_grapheme_table long_graphemes = {};
152
153} // namespace detail
154
/** Tag type selecting the `grapheme` constructor that takes code-points
 * which are already composed (NFC normalized).
 */
struct composed_t {};
156
168struct grapheme {
169 using value_type = uint64_t;
170
183 value_type _value;
184
185 constexpr grapheme() noexcept = default;
186 constexpr grapheme(grapheme const&) noexcept = default;
187 constexpr grapheme(grapheme&&) noexcept = default;
188 constexpr grapheme& operator=(grapheme const&) noexcept = default;
189 constexpr grapheme& operator=(grapheme&&) noexcept = default;
190
191 constexpr grapheme(std::in_place_t, value_type value) : _value(value) {}
192
193 constexpr value_type& intrinsic() noexcept
194 {
195 return _value;
196 }
197
198 constexpr value_type const& intrinsic() const noexcept
199 {
200 return _value;
201 }
202
205 constexpr grapheme(char32_t code_point) noexcept : _value(char_cast<value_type>(code_point))
206 {
207 hi_axiom(code_point <= 0x10'ffff);
208 hi_axiom(ucd_get_canonical_combining_class(code_point) == 0, "Single code-point must be a starter");
209 set_script();
210 }
211
212 constexpr grapheme(char ascii_char) noexcept : _value(char_cast<value_type>(ascii_char))
213 {
214 hi_axiom(ascii_char >= 0 and ascii_char <= 0x7f);
215 // All ASCII characters are starters.
216 set_script();
217 }
218
221 constexpr grapheme& operator=(char32_t code_point) noexcept
222 {
223 hi_axiom(code_point <= 0x10'ffff);
224 hi_axiom(ucd_get_canonical_combining_class(code_point) == 0, "Single code-point must be a starter");
225
226 _value = char_cast<value_type>(code_point);
227 set_script();
228 return *this;
229 }
230
233 constexpr grapheme& operator=(char ascii_char) noexcept
234 {
235 hi_axiom(ascii_char >= 0 and ascii_char <= 0x7f);
236 // All ASCII characters are starters.
237
238 _value = char_cast<value_type>(ascii_char);
239 set_script();
240 return *this;
241 }
242
247 template<typename CodePoints>
248 constexpr grapheme(composed_t, CodePoints&& code_points) noexcept
249 {
250 static_assert(std::is_same_v<typename std::remove_cvref_t<CodePoints>::value_type, char32_t>);
251
252 hi_axiom(not code_points.empty());
253 if (code_points.size() == 1) {
254 auto const code_point = code_points.front();
255 hi_axiom(code_point <= 0x10'ffff);
256 hi_axiom(ucd_get_canonical_combining_class(code_point) == 0);
257 _value = char_cast<value_type>(code_point);
258
259 } else {
260 auto const index = detail::long_graphemes.add_grapheme(std::forward<CodePoints>(code_points));
261 if (index >= 0) {
262 _value = narrow_cast<value_type>(index + 0x11'0000);
263
264 } else {
265 [[unlikely]] hi_log_error_once(
266 "grapheme::error::too-many", "Too many long graphemes encoded, replacing with U+fffd");
267 _value = char_cast<value_type>(U'\ufffd');
268 }
269 }
270 set_script();
271 }
272
277 constexpr explicit grapheme(std::u32string_view code_points) noexcept :
278 grapheme(composed_t{}, unicode_normalize(code_points, unicode_normalize_config::NFC()))
279 {
280 }
281
284 [[nodiscard]] constexpr uint32_t index() const noexcept
285 {
286 return _value & 0x1f'ffff;
287 }
288
289 [[nodiscard]] constexpr iso_639 language() const noexcept
290 {
291 return iso_639{std::in_place_t{}, narrow_cast<uint16_t>((_value >> 21) & 0x7fff)};
292 }
293
294 constexpr void set_language(iso_639 rhs) noexcept
295 {
296 hi_axiom(rhs.intrinsic() <= 0x7fff);
297
298 constexpr auto mask = ~(value_type{0x7fff} << 21);
299 _value &= mask;
300 _value |= wide_cast<value_type>(rhs.intrinsic()) << 21;
301 }
302
308 [[nodiscard]] constexpr iso_15924 starter_script() const noexcept
309 {
310 return ucd_get_script(starter());
311 }
312
320 [[nodiscard]] constexpr iso_15924 starter_script(iso_15924 default_script) const noexcept
321 {
322 auto const starter_script_ = starter_script();
323 if (starter_script_ == iso_15924::common() and starter_script_ == iso_15924::inherited()) {
324 return default_script;
325 } else {
326 return starter_script_;
327 }
328 }
329
330 [[nodiscard]] constexpr iso_15924 script() const noexcept
331 {
332 return iso_15924{std::in_place_t{}, narrow_cast<uint16_t>((_value >> 36) & 0x3ff)};
333 }
334
340 constexpr void set_script(iso_15924 rhs) noexcept
341 {
342 auto const new_script = starter_script(rhs);
343 hi_axiom(new_script.intrinsic() < 1000);
344
345 constexpr auto mask = ~(value_type{0x3ff} << 36);
346 _value &= mask;
347 _value |= wide_cast<value_type>(new_script.intrinsic()) << 36;
348 }
349
352 constexpr void set_script() noexcept
353 {
355 }
356
357 [[nodiscard]] constexpr iso_3166 region() const noexcept
358 {
359 return iso_3166{std::in_place_t{}, narrow_cast<uint16_t>((_value >> 46) & 0x3ff)};
360 }
361
362 constexpr void set_region(iso_3166 rhs) noexcept
363 {
364 hi_axiom(rhs.intrinsic() < 1000);
365
366 constexpr auto mask = ~(value_type{0x3ff} << 46);
367 _value &= mask;
368 _value |= wide_cast<value_type>(rhs.intrinsic()) << 46;
369 }
370
371 [[nodiscard]] constexpr hi::language_tag language_tag() const noexcept
372 {
373 auto tmp = _value;
374 tmp >>= 21;
375 auto const language_ = iso_639{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x7fff)};
376 tmp >>= 15;
377 auto const script_ = iso_15924{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x3ff)};
378 tmp >>= 10;
379 auto const region_ = iso_3166{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x3ff)};
380 return hi::language_tag{language_, script_, region_};
381 }
382
383 constexpr void set_language_tag(hi::language_tag rhs) noexcept
384 {
385 hi_axiom(rhs.region.intrinsic() < 1000);
386 hi_axiom(rhs.language.intrinsic() <= 0x7fff);
387
388 auto const new_script = starter_script(rhs.script);
389 hi_axiom(new_script.intrinsic() < 1000);
390
391 auto tmp = wide_cast<value_type>(rhs.region.intrinsic());
392 tmp <<= 10;
393 tmp |= new_script.intrinsic();
394 tmp <<= 15;
395 tmp |= rhs.language.intrinsic();
396 tmp <<= 21;
397
398 constexpr auto mask = ~(uint64_t{0x7'ffff'ffff} << 21);
399 _value &= mask;
400 _value |= tmp;
401 }
402
403 [[nodiscard]] constexpr hi::phrasing phrasing() const noexcept
404 {
405 return static_cast<hi::phrasing>((_value >> 56) & 0x3f);
406 }
407
408 constexpr void set_phrasing(hi::phrasing rhs) noexcept
409 {
410 hi_axiom(std::to_underlying(rhs) <= 0x3f);
411
412 constexpr auto mask = ~(value_type{0x3f} << 56);
413 _value &= mask;
414 _value |= static_cast<value_type>(rhs) << 56;
415 }
416
417 [[nodiscard]] constexpr grapheme_attributes attributes() const noexcept
418 {
419 auto r = grapheme_attributes{};
420
421 auto tmp = _value;
422 tmp >>= 21;
423 r.language = iso_639{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x7fff)};
424 tmp >>= 15;
425 r.script = iso_15924{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x3ff)};
426 tmp >>= 10;
427 r.region = iso_3166{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x3ff)};
428 tmp >>= 10;
429 r.phrasing = static_cast<hi::phrasing>(tmp & 0x3f);
430 return r;
431 }
432
435 [[nodiscard]] constexpr std::size_t size() const noexcept
436 {
437 if (auto i = index(); i <= 0x10'ffff) {
438 return 1_uz;
439 } else {
440 return detail::long_graphemes.get_grapheme_size(i - 0x11'0000);
441 }
442 }
443
444 [[nodiscard]] constexpr char32_t starter() const noexcept
445 {
446 if (auto i = index(); i <= 0x10'ffff) {
447 return char_cast<char32_t>(i);
448 } else {
449 return detail::long_graphemes.get_grapheme_starter(i - 0x11'0000);
450 }
451 }
452
453 [[nodiscard]] constexpr bool is_ascii() const noexcept
454 {
455 return index() <= 127;
456 }
457
460 [[nodiscard]] constexpr std::u32string composed() const noexcept
461 {
462 if (auto const i = index(); i <= 0x10'ffff) {
463 return std::u32string{char_cast<char32_t>(i)};
464 } else {
465 return detail::long_graphemes.get_grapheme(i - 0x11'0000);
466 }
467 }
468
471 [[nodiscard]] constexpr std::u32string
472 decomposed(unicode_normalize_config config = unicode_normalize_config::NFD()) const noexcept
473 {
474 return unicode_decompose(composed(), config);
475 }
476
481 [[nodiscard]] friend constexpr bool operator==(grapheme const& lhs, grapheme const& rhs) noexcept
482 {
483 return lhs.index() == rhs.index();
484 }
485
486 [[nodiscard]] friend constexpr bool operator==(grapheme const& lhs, char32_t const& rhs) noexcept
487 {
488 hi_axiom(char_cast<value_type>(rhs) <= 0x10'ffff);
489 return lhs.index() == char_cast<value_type>(rhs);
490 }
491
492 [[nodiscard]] friend constexpr bool operator==(grapheme const& lhs, char const& rhs) noexcept
493 {
494 hi_axiom(char_cast<value_type>(rhs) <= 0x7f);
495 return lhs.index() == char_cast<value_type>(rhs);
496 }
497
500 [[nodiscard]] friend constexpr std::strong_ordering operator<=>(grapheme const& lhs, grapheme const& rhs) noexcept
501 {
502 return lhs.decomposed() <=> rhs.decomposed();
503 }
504
505 [[nodiscard]] friend constexpr std::strong_ordering operator<=>(grapheme const& lhs, char32_t const& rhs) noexcept
506 {
507 return lhs <=> grapheme{rhs};
508 }
509
510 [[nodiscard]] friend constexpr std::strong_ordering operator<=>(grapheme const& lhs, char const& rhs) noexcept
511 {
512 return lhs <=> grapheme{rhs};
513 }
514
515 [[nodiscard]] friend constexpr std::string to_string(grapheme const& rhs) noexcept
516 {
517 return hi::to_string(rhs.composed());
518 }
519
520 [[nodiscard]] friend constexpr std::wstring to_wstring(grapheme const& rhs) noexcept
521 {
522 return hi::to_wstring(rhs.composed());
523 }
524
525 [[nodiscard]] friend constexpr std::u32string to_u32string(grapheme const& rhs) noexcept
526 {
527 return rhs.composed();
528 }
529};
530
531} // namespace hi::inline v1
532
533template<>
534struct std::hash<hi::grapheme> {
535 [[nodiscard]] std::size_t operator()(hi::grapheme const& rhs) const noexcept
536 {
537 return std::hash<hi::grapheme::value_type>{}(rhs._value);
538 }
539};
constexpr std::u32string to_u32string(std::u32string_view rhs) noexcept
Identity conversion from UTF-32 to UTF-32.
Definition to_string.hpp:28
constexpr std::wstring to_wstring(std::u32string_view rhs) noexcept
Conversion from UTF-32 to wide-string (UTF-16/32).
Definition to_string.hpp:160
@ grapheme
The gui_event has grapheme data.
Definition gui_event_variant.hpp:40
phrasing
Phrasing.
Definition phrasing.hpp:31
STL namespace.
The HikoGUI namespace.
Definition array_generic.hpp:21
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
constexpr void set_language(It first, ItEnd last, language_tag language) noexcept
Set the language for the string.
Definition gstring.hpp:187
constexpr std::u32string unicode_decompose(std::u32string_view text, unicode_normalize_config config=unicode_normalize_config::NFD()) noexcept
Convert text to a Unicode decomposed normal form.
Definition unicode_normalization.hpp:293
constexpr bool unicode_is_NFC_grapheme(It it, ItEnd last) noexcept
Check if the string of code-points is a single grapheme in NFC normal form.
Definition unicode_normalization.hpp:324
constexpr std::u32string unicode_normalize(std::u32string_view text, unicode_normalize_config config=unicode_normalize_config::NFC()) noexcept
Convert text to a Unicode composed normal form.
Definition unicode_normalization.hpp:308
ISO-3166 country code.
Definition iso_3166_intf.hpp:21
ISO-639 language code.
Definition iso_639.hpp:29
size_t get_grapheme_size(uint32_t start) const noexcept
Get the size of the grapheme.
Definition grapheme.hpp:79
char32_t get_grapheme_starter(uint32_t start) const noexcept
Get the starter (first) code-point of a grapheme.
Definition grapheme.hpp:92
std::u32string get_grapheme(uint32_t start) const noexcept
Get the grapheme from the table.
Definition grapheme.hpp:55
int32_t add_grapheme(CodePoints &&code_points) noexcept
Find or insert a grapheme in the table.
Definition grapheme.hpp:105
Definition grapheme.hpp:155
A grapheme-cluster, what a user thinks a character is.
Definition grapheme.hpp:168
constexpr grapheme(std::u32string_view code_points) noexcept
Encode a grapheme from a list of code-points.
Definition grapheme.hpp:277
friend constexpr bool operator==(grapheme const &lhs, grapheme const &rhs) noexcept
Compare equivalence of two graphemes.
Definition grapheme.hpp:481
constexpr void set_script() noexcept
Set the script of the grapheme to the starter script.
Definition grapheme.hpp:352
friend constexpr std::strong_ordering operator<=>(grapheme const &lhs, grapheme const &rhs) noexcept
Compare two graphemes lexicographically.
Definition grapheme.hpp:500
constexpr iso_15924 starter_script(iso_15924 default_script) const noexcept
Get the script of the starter code-point.
Definition grapheme.hpp:320
constexpr uint32_t index() const noexcept
Get the codepoint/index part of the grapheme.
Definition grapheme.hpp:284
constexpr grapheme(char32_t code_point) noexcept
Encode a single code-point.
Definition grapheme.hpp:205
constexpr grapheme & operator=(char ascii_char) noexcept
Encode a single code-point.
Definition grapheme.hpp:233
constexpr grapheme(composed_t, CodePoints &&code_points) noexcept
Encode a grapheme from a list of code-points.
Definition grapheme.hpp:248
value_type _value
The grapheme's value.
Definition grapheme.hpp:183
constexpr std::u32string decomposed(unicode_normalize_config config=unicode_normalize_config::NFD()) const noexcept
Get a list of code-points normalized to NFD.
Definition grapheme.hpp:472
constexpr iso_15924 starter_script() const noexcept
Get the script of the starter code-point.
Definition grapheme.hpp:308
constexpr std::u32string composed() const noexcept
Get a list of code-points normalized to NFC.
Definition grapheme.hpp:460
constexpr void set_script(iso_15924 rhs) noexcept
Set the script of the grapheme.
Definition grapheme.hpp:340
constexpr grapheme & operator=(char32_t code_point) noexcept
Encode a single code-point.
Definition grapheme.hpp:221
constexpr std::size_t size() const noexcept
Return the number of code-points encoded in the grapheme.
Definition grapheme.hpp:435
Definition unicode_normalization.hpp:24
T addressof(T... args)
T copy(T... args)
T copy_n(T... args)
T forward(T... args)
T operator()(T... args)
T to_string(T... args)
T to_wstring(T... args)