HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
grapheme.hpp
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "../utility/utility.hpp"
8#include "../i18n/i18n.hpp"
9#include "../telemetry/telemetry.hpp"
10#include "../concurrency/concurrency.hpp"
11#include "../char_maps/char_maps.hpp"
12#include "unicode_normalization.hpp"
13#include "ucd_general_categories.hpp"
14#include "ucd_canonical_combining_classes.hpp"
15#include "ucd_scripts.hpp"
16#include "phrasing.hpp"
17#include "../macros.hpp"
18#include <cstdint>
19#include <string>
20#include <string_view>
21#include <cstddef>
22#include <memory>
23#include <vector>
24#include <algorithm>
25#include <bit>
26#include <array>
27#include <atomic>
28#include <unordered_map>
29#include <mutex>
30#include <chrono>
31#include <format>
32
33hi_export_module(hikogui.unicode.grapheme);
34
35hi_export namespace hi::inline v1 {
36namespace detail {
37
39public:
// The table is default-constructible.
40 long_grapheme_table() = default;
// The table can not be assigned to, neither by copy nor by move.
43 long_grapheme_table& operator=(long_grapheme_table const&) = delete;
44 long_grapheme_table& operator=(long_grapheme_table&&) = delete;
45
54 [[nodiscard]] std::u32string get_grapheme(uint32_t start) const noexcept
55 {
56 // If `start` came from another thread it will have been transferred
57 // to this thread pr
58 auto src = std::addressof(_table[start]);
59 auto const length = *src >> 21;
60
61 auto r = std::u32string{};
62 r.resize_and_overwrite(length, [&](char32_t *dst, size_t count) {
63 std::copy_n(src, count, dst);
64 *dst &= 0x1f'ffff;
65 return count;
66 });
67 return r;
68 }
69
78 [[nodiscard]] size_t get_grapheme_size(uint32_t start) const noexcept
79 {
80 return _table[start] >> 21;
81 }
82
91 [[nodiscard]] char32_t get_grapheme_starter(uint32_t start) const noexcept
92 {
93 return char32_t{_table[start] & 0x1f'ffff};
94 }
95
103 template<typename CodePoints>
104 [[nodiscard]] int32_t add_grapheme(CodePoints&& code_points) noexcept
105 {
106 static_assert(std::is_same_v<typename std::remove_cvref_t<CodePoints>::value_type, char32_t>);
107
108 hi_axiom(code_points.size() >= 2);
109 hi_axiom(unicode_is_NFC_grapheme(code_points.cbegin(), code_points.cend()));
110
111 auto const lock = std::scoped_lock(_mutex);
112
113 // See if this grapheme already exists and return its index.
114 if (auto const it = _indices.find(code_points); it != _indices.end()) {
115 return it->second;
116 }
117
118 // Check if there is enough room in the table to add the code-points.
119 if (_head + code_points.size() >= _table.size()) {
120 return -1;
121 }
122
123 auto const insert_index = _head;
124 _head += narrow_cast<uint32_t>(code_points.size());
125
126 // Copy the grapheme into the table, and set the size on the first entry.
127 std::copy(code_points.cbegin(), code_points.cend(), _table.begin() + insert_index);
128 _table[insert_index] |= char_cast<char32_t>(code_points.size() << 21);
129
130 // Add the grapheme to the quickly searchable index table.
131 _indices.emplace(std::forward<CodePoints>(code_points), insert_index);
132
133 return insert_index;
134 }
135
136private:
// Locked by add_grapheme() for the duration of its lookup and insert.
137 mutable unfair_mutex _mutex = {};
// Index of the first unused entry of `_table`; add_grapheme() advances it by
// the number of code-points it inserts.
138 uint32_t _head = {};
139
// Storage for the code-points of all graphemes added through add_grapheme().
// The first entry of each grapheme holds the starter code-point in its low
// 21 bits and the number of code-points in the bits above.
145 std::array<char32_t, 0x0f'0000> _table = {};
146
148};
149
// The table instance through which `grapheme` stores and resolves values that
// consist of more than one code-point.
150inline long_grapheme_table long_graphemes = {};
151
152} // namespace detail
153
// Tag type; passed as first argument it selects the `grapheme` constructor
// that takes a list of code-points (the explicit std::u32string_view
// constructor calls it with NFC-normalized code-points).
154struct composed_t {};
155
167struct grapheme {
168 using value_type = uint64_t;
169
182 value_type _value;
183
184 constexpr grapheme() noexcept = default;
185 constexpr grapheme(grapheme const&) noexcept = default;
186 constexpr grapheme(grapheme&&) noexcept = default;
187 constexpr grapheme& operator=(grapheme const&) noexcept = default;
188 constexpr grapheme& operator=(grapheme&&) noexcept = default;
189
190 constexpr grapheme(std::in_place_t, value_type value) : _value(value) {}
191
192 constexpr value_type& intrinsic() noexcept
193 {
194 return _value;
195 }
196
197 constexpr value_type const& intrinsic() const noexcept
198 {
199 return _value;
200 }
201
204 constexpr grapheme(char32_t code_point) noexcept : _value(char_cast<value_type>(code_point))
205 {
206 hi_axiom(code_point <= 0x10'ffff);
207 hi_axiom(ucd_get_canonical_combining_class(code_point) == 0, "Single code-point must be a starter");
208 set_script();
209 }
210
211 constexpr grapheme(char ascii_char) noexcept : _value(char_cast<value_type>(ascii_char))
212 {
213 hi_axiom(ascii_char >= 0 and ascii_char <= 0x7f);
214 // All ASCII characters are starters.
215 set_script();
216 }
217
220 constexpr grapheme& operator=(char32_t code_point) noexcept
221 {
222 hi_axiom(code_point <= 0x10'ffff);
223 hi_axiom(ucd_get_canonical_combining_class(code_point) == 0, "Single code-point must be a starter");
224
225 _value = char_cast<value_type>(code_point);
226 set_script();
227 return *this;
228 }
229
232 constexpr grapheme& operator=(char ascii_char) noexcept
233 {
234 hi_axiom(ascii_char >= 0 and ascii_char <= 0x7f);
235 // All ASCII characters are starters.
236
237 _value = char_cast<value_type>(ascii_char);
238 set_script();
239 return *this;
240 }
241
246 template<typename CodePoints>
247 constexpr grapheme(composed_t, CodePoints&& code_points) noexcept
248 {
249 static_assert(std::is_same_v<typename std::remove_cvref_t<CodePoints>::value_type, char32_t>);
250
251 hi_axiom(not code_points.empty());
252 if (code_points.size() == 1) {
253 auto const code_point = code_points.front();
254 hi_axiom(code_point <= 0x10'ffff);
255 hi_axiom(ucd_get_canonical_combining_class(code_point) == 0);
256 _value = char_cast<value_type>(code_point);
257
258 } else {
259 auto const index = detail::long_graphemes.add_grapheme(std::forward<CodePoints>(code_points));
260 if (index >= 0) {
261 _value = narrow_cast<value_type>(index + 0x11'0000);
262
263 } else {
264 [[unlikely]] hi_log_error_once(
265 "grapheme::error::too-many", "Too many long graphemes encoded, replacing with U+fffd");
266 _value = char_cast<value_type>(U'\ufffd');
267 }
268 }
269 set_script();
270 }
271
276 constexpr explicit grapheme(std::u32string_view code_points) noexcept :
277 grapheme(composed_t{}, unicode_normalize(code_points, unicode_normalize_config::NFC()))
278 {
279 }
280
283 [[nodiscard]] constexpr uint32_t index() const noexcept
284 {
285 return _value & 0x1f'ffff;
286 }
287
288 [[nodiscard]] constexpr iso_639 language() const noexcept
289 {
290 return iso_639{std::in_place_t{}, narrow_cast<uint16_t>((_value >> 21) & 0x7fff)};
291 }
292
293 constexpr void set_language(iso_639 rhs) noexcept
294 {
295 hi_axiom(rhs.intrinsic() <= 0x7fff);
296
297 constexpr auto mask = ~(value_type{0x7fff} << 21);
298 _value &= mask;
299 _value |= wide_cast<value_type>(rhs.intrinsic()) << 21;
300 }
301
307 [[nodiscard]] constexpr iso_15924 starter_script() const noexcept
308 {
309 return ucd_get_script(starter());
310 }
311
319 [[nodiscard]] constexpr iso_15924 starter_script(iso_15924 default_script) const noexcept
320 {
321 auto const starter_script_ = starter_script();
322 if (starter_script_ == iso_15924::common() and starter_script_ == iso_15924::inherited()) {
323 return default_script;
324 } else {
325 return starter_script_;
326 }
327 }
328
329 [[nodiscard]] constexpr iso_15924 script() const noexcept
330 {
331 return iso_15924{std::in_place_t{}, narrow_cast<uint16_t>((_value >> 36) & 0x3ff)};
332 }
333
339 constexpr void set_script(iso_15924 rhs) noexcept
340 {
341 auto const new_script = starter_script(rhs);
342 hi_axiom(new_script.intrinsic() < 1000);
343
344 constexpr auto mask = ~(value_type{0x3ff} << 36);
345 _value &= mask;
346 _value |= wide_cast<value_type>(new_script.intrinsic()) << 36;
347 }
348
351 constexpr void set_script() noexcept
352 {
353 set_script(starter_script());
354 }
355
356 [[nodiscard]] constexpr iso_3166 region() const noexcept
357 {
358 return iso_3166{std::in_place_t{}, narrow_cast<uint16_t>((_value >> 46) & 0x3ff)};
359 }
360
361 constexpr void set_region(iso_3166 rhs) noexcept
362 {
363 hi_axiom(rhs.intrinsic() < 1000);
364
365 constexpr auto mask = ~(value_type{0x3ff} << 46);
366 _value &= mask;
367 _value |= wide_cast<value_type>(rhs.intrinsic()) << 46;
368 }
369
370 [[nodiscard]] constexpr hi::language_tag language_tag() const noexcept
371 {
372 auto tmp = _value;
373 tmp >>= 21;
374 auto const language_ = iso_639{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x7fff)};
375 tmp >>= 15;
376 auto const script_ = iso_15924{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x3ff)};
377 tmp >>= 10;
378 auto const region_ = iso_3166{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x3ff)};
379 return hi::language_tag{language_, script_, region_};
380 }
381
382 constexpr void set_language_tag(hi::language_tag rhs) noexcept
383 {
384 hi_axiom(rhs.region.intrinsic() < 1000);
385 hi_axiom(rhs.language.intrinsic() <= 0x7fff);
386
387 auto const new_script = starter_script(rhs.script);
388 hi_axiom(new_script.intrinsic() < 1000);
389
390 auto tmp = wide_cast<value_type>(rhs.region.intrinsic());
391 tmp <<= 10;
392 tmp |= new_script.intrinsic();
393 tmp <<= 15;
394 tmp |= rhs.language.intrinsic();
395 tmp <<= 21;
396
397 constexpr auto mask = ~(uint64_t{0x7'ffff'ffff} << 21);
398 _value &= mask;
399 _value |= tmp;
400 }
401
402 [[nodiscard]] constexpr hi::phrasing phrasing() const noexcept
403 {
404 return static_cast<hi::phrasing>((_value >> 56) & 0x3f);
405 }
406
407 constexpr void set_phrasing(hi::phrasing rhs) noexcept
408 {
409 hi_axiom(std::to_underlying(rhs) <= 0x3f);
410
411 constexpr auto mask = ~(value_type{0x3f} << 56);
412 _value &= mask;
413 _value |= static_cast<value_type>(rhs) << 56;
414 }
415
418 [[nodiscard]] constexpr std::size_t size() const noexcept
419 {
420 if (auto i = index(); i <= 0x10'ffff) {
421 return 1_uz;
422 } else {
423 return detail::long_graphemes.get_grapheme_size(i - 0x11'0000);
424 }
425 }
426
427 [[nodiscard]] constexpr char32_t starter() const noexcept
428 {
429 if (auto i = index(); i <= 0x10'ffff) {
430 return char_cast<char32_t>(i);
431 } else {
432 return detail::long_graphemes.get_grapheme_starter(i - 0x11'0000);
433 }
434 }
435
436 [[nodiscard]] constexpr bool is_ascii() const noexcept
437 {
438 return index() <= 127;
439 }
440
443 [[nodiscard]] constexpr std::u32string composed() const noexcept
444 {
445 if (auto const i = index(); i <= 0x10'ffff) {
446 return std::u32string{char_cast<char32_t>(i)};
447 } else {
448 return detail::long_graphemes.get_grapheme(i - 0x11'0000);
449 }
450 }
451
454 [[nodiscard]] constexpr std::u32string
455 decomposed(unicode_normalize_config config = unicode_normalize_config::NFD()) const noexcept
456 {
457 return unicode_decompose(composed(), config);
458 }
459
464 [[nodiscard]] friend constexpr bool operator==(grapheme const& lhs, grapheme const& rhs) noexcept
465 {
466 return lhs.index() == rhs.index();
467 }
468
469 [[nodiscard]] friend constexpr bool operator==(grapheme const& lhs, char32_t const& rhs) noexcept
470 {
471 hi_axiom(char_cast<value_type>(rhs) <= 0x10'ffff);
472 return lhs.index() == char_cast<value_type>(rhs);
473 }
474
475 [[nodiscard]] friend constexpr bool operator==(grapheme const& lhs, char const& rhs) noexcept
476 {
477 hi_axiom(char_cast<value_type>(rhs) <= 0x7f);
478 return lhs.index() == char_cast<value_type>(rhs);
479 }
480
483 [[nodiscard]] friend constexpr std::strong_ordering operator<=>(grapheme const& lhs, grapheme const& rhs) noexcept
484 {
485 return lhs.decomposed() <=> rhs.decomposed();
486 }
487
488 [[nodiscard]] friend constexpr std::strong_ordering operator<=>(grapheme const& lhs, char32_t const& rhs) noexcept
489 {
490 return lhs <=> grapheme{rhs};
491 }
492
493 [[nodiscard]] friend constexpr std::strong_ordering operator<=>(grapheme const& lhs, char const& rhs) noexcept
494 {
495 return lhs <=> grapheme{rhs};
496 }
497
498 [[nodiscard]] friend constexpr std::string to_string(grapheme const& rhs) noexcept
499 {
500 return hi::to_string(rhs.composed());
501 }
502
503 [[nodiscard]] friend constexpr std::wstring to_wstring(grapheme const& rhs) noexcept
504 {
505 return hi::to_wstring(rhs.composed());
506 }
507
508 [[nodiscard]] friend constexpr std::u32string to_u32string(grapheme const& rhs) noexcept
509 {
510 return rhs.composed();
511 }
512};
513
514} // namespace hi::inline v1
515
516template<>
517struct std::hash<hi::grapheme> {
518 [[nodiscard]] std::size_t operator()(hi::grapheme const& rhs) const noexcept
519 {
520 return std::hash<hi::grapheme::value_type>{}(rhs._value);
521 }
522};
constexpr std::u32string to_u32string(std::u32string_view rhs) noexcept
Identity conversion from UTF-32 to UTF-32.
Definition to_string.hpp:28
constexpr std::wstring to_wstring(std::u32string_view rhs) noexcept
Conversion from UTF-32 to wide-string (UTF-16/32).
Definition to_string.hpp:160
@ grapheme
The gui_event has grapheme data.
phrasing
Phrasing.
Definition phrasing.hpp:33
STL namespace.
The HikoGUI namespace.
Definition array_generic.hpp:20
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
constexpr void set_language(It first, ItEnd last, language_tag language) noexcept
Set the language for the string.
Definition gstring.hpp:187
constexpr bool unicode_is_NFC_grapheme(It it, ItEnd last) noexcept
Check if the string of code-points is a single grapheme in NFC normal form.
Definition unicode_normalization.hpp:324
constexpr std::u32string unicode_normalize(std::u32string_view text, unicode_normalize_config config=unicode_normalize_config::NFC()) noexcept
Convert text to a Unicode composed normal form.
Definition unicode_normalization.hpp:308
ISO-3166 country code.
Definition iso_3166_intf.hpp:21
ISO-639 language code.
Definition iso_639.hpp:29
The IETF BCP 47 language tag.
Definition language_tag_intf.hpp:30
Definition grapheme.hpp:38
size_t get_grapheme_size(uint32_t start) const noexcept
Get the size of the grapheme.
Definition grapheme.hpp:78
char32_t get_grapheme_starter(uint32_t start) const noexcept
Get the starter (first) code-point of a grapheme.
Definition grapheme.hpp:91
std::u32string get_grapheme(uint32_t start) const noexcept
Get the grapheme from the table.
Definition grapheme.hpp:54
int32_t add_grapheme(CodePoints &&code_points) noexcept
Find or insert a grapheme in the table.
Definition grapheme.hpp:104
Definition grapheme.hpp:154
A grapheme-cluster, what a user thinks a character is.
Definition grapheme.hpp:167
constexpr grapheme(std::u32string_view code_points) noexcept
Encode a grapheme from a list of code-points.
Definition grapheme.hpp:276
friend constexpr bool operator==(grapheme const &lhs, grapheme const &rhs) noexcept
Compare equivalence of two graphemes.
Definition grapheme.hpp:464
constexpr void set_script() noexcept
Set the script of the grapheme to the starter script.
Definition grapheme.hpp:351
friend constexpr std::strong_ordering operator<=>(grapheme const &lhs, grapheme const &rhs) noexcept
Compare two graphemes lexicographically.
Definition grapheme.hpp:483
constexpr iso_15924 starter_script(iso_15924 default_script) const noexcept
Get the script of the starter code-point.
Definition grapheme.hpp:319
constexpr uint32_t index() const noexcept
Get the codepoint/index part of the grapheme.
Definition grapheme.hpp:283
constexpr grapheme(char32_t code_point) noexcept
Encode a single code-point.
Definition grapheme.hpp:204
constexpr grapheme & operator=(char ascii_char) noexcept
Encode a single code-point.
Definition grapheme.hpp:232
constexpr grapheme(composed_t, CodePoints &&code_points) noexcept
Encode a grapheme from a list of code-points.
Definition grapheme.hpp:247
value_type _value
The grapheme's value.
Definition grapheme.hpp:182
constexpr std::u32string decomposed(unicode_normalize_config config=unicode_normalize_config::NFD()) const noexcept
Get a list of code-point normalized to NFD.
Definition grapheme.hpp:455
constexpr iso_15924 starter_script() const noexcept
Get the script of the starter code-point.
Definition grapheme.hpp:307
constexpr std::u32string composed() const noexcept
Get a list of code-point normalized to NFC.
Definition grapheme.hpp:443
constexpr void set_script(iso_15924 rhs) noexcept
Set the script of the grapheme.
Definition grapheme.hpp:339
constexpr grapheme & operator=(char32_t code_point) noexcept
Encode a single code-point.
Definition grapheme.hpp:220
constexpr std::size_t size() const noexcept
Return the number of code-points encoded in the grapheme.
Definition grapheme.hpp:418
Definition unicode_normalization.hpp:24
T addressof(T... args)
T copy(T... args)
T copy_n(T... args)
T operator()(T... args)
T to_string(T... args)
T to_wstring(T... args)