7#include "../utility/utility.hpp"
8#include "../i18n/i18n.hpp"
9#include "../telemetry/telemetry.hpp"
10#include "../concurrency/concurrency.hpp"
11#include "../char_maps/char_maps.hpp"
12#include "grapheme_attributes.hpp"
13#include "unicode_normalization.hpp"
14#include "ucd_general_categories.hpp"
15#include "ucd_canonical_combining_classes.hpp"
16#include "ucd_scripts.hpp"
17#include "phrasing.hpp"
18#include "../macros.hpp"
29#include <unordered_map>
34hi_export_module(hikogui.unicode : grapheme);
36hi_export
namespace hi::inline
v1 {
39class long_grapheme_table {
41 long_grapheme_table() =
default;
42 long_grapheme_table(long_grapheme_table
const&) =
delete;
43 long_grapheme_table(long_grapheme_table&&) =
delete;
44 long_grapheme_table& operator=(long_grapheme_table
const&) =
delete;
45 long_grapheme_table& operator=(long_grapheme_table&&) =
delete;
60 auto const length = *src >> 21;
63 r.resize_and_overwrite(length, [&](
char32_t *dst,
size_t count) {
81 return _table[start] >> 21;
94 return char32_t{_table[start] & 0x1f'ffff};
104 template<
typename CodePo
ints>
107 static_assert(std::is_same_v<typename std::remove_cvref_t<CodePoints>::value_type,
char32_t>);
109 hi_axiom(code_points.size() >= 2);
112 auto const lock = std::scoped_lock(_mutex);
115 if (
auto const it = _indices.find(code_points); it != _indices.end()) {
120 if (_head + code_points.size() >= _table.size()) {
124 auto const insert_index = _head;
125 _head += narrow_cast<uint32_t>(code_points.size());
128 std::copy(code_points.cbegin(), code_points.cend(), _table.begin() + insert_index);
129 _table[insert_index] |= char_cast<char32_t>(code_points.size() << 21);
138 mutable unfair_mutex _mutex = {};
148 std::unordered_map<std::u32string, uint32_t> _indices = {};
151inline long_grapheme_table long_graphemes = {};
169 using value_type = uint64_t;
185 constexpr grapheme() noexcept = default;
186 constexpr grapheme(grapheme const&) noexcept = default;
187 constexpr grapheme(grapheme&&) noexcept = default;
188 constexpr grapheme& operator=(grapheme const&) noexcept = default;
189 constexpr grapheme& operator=(grapheme&&) noexcept = default;
191 constexpr grapheme(
std::in_place_t, value_type value) :
_value(value) {}
193 constexpr value_type& intrinsic() noexcept
198 constexpr value_type
const& intrinsic() const noexcept
205 constexpr grapheme(
char32_t code_point) noexcept :
_value(char_cast<value_type>(code_point))
207 hi_axiom(code_point <= 0x10'ffff);
208 hi_axiom(ucd_get_canonical_combining_class(code_point) == 0,
"Single code-point must be a starter");
212 constexpr grapheme(
char ascii_char) noexcept : _value(char_cast<value_type>(ascii_char))
214 hi_axiom(ascii_char >= 0 and ascii_char <= 0x7f);
221 constexpr grapheme&
operator=(
char32_t code_point)
noexcept
223 hi_axiom(code_point <= 0x10'ffff);
224 hi_axiom(ucd_get_canonical_combining_class(code_point) == 0,
"Single code-point must be a starter");
226 _value = char_cast<value_type>(code_point);
235 hi_axiom(ascii_char >= 0 and ascii_char <= 0x7f);
238 _value = char_cast<value_type>(ascii_char);
247 template<
typename CodePo
ints>
250 static_assert(std::is_same_v<typename std::remove_cvref_t<CodePoints>::value_type,
char32_t>);
252 hi_axiom(not code_points.empty());
253 if (code_points.size() == 1) {
254 auto const code_point = code_points.front();
255 hi_axiom(code_point <= 0x10'ffff);
256 hi_axiom(ucd_get_canonical_combining_class(code_point) == 0);
257 _value = char_cast<value_type>(code_point);
262 _value = narrow_cast<value_type>(
index + 0x11'0000);
265 [[unlikely]] hi_log_error_once(
266 "grapheme::error::too-many",
"Too many long graphemes encoded, replacing with U+fffd");
267 _value = char_cast<value_type>(U
'\ufffd');
277 constexpr explicit grapheme(std::u32string_view code_points) noexcept :
284 [[nodiscard]]
constexpr uint32_t
index() const noexcept
286 return _value & 0x1f'ffff;
289 [[nodiscard]]
constexpr iso_639 language() const noexcept
291 return iso_639{std::in_place_t{}, narrow_cast<uint16_t>((_value >> 21) & 0x7fff)};
296 hi_axiom(rhs.intrinsic() <= 0x7fff);
298 constexpr auto mask = ~(value_type{0x7fff} << 21);
300 _value |= wide_cast<value_type>(rhs.intrinsic()) << 21;
310 return ucd_get_script(starter());
320 [[nodiscard]]
constexpr iso_15924
starter_script(iso_15924 default_script)
const noexcept
323 if (starter_script_ == iso_15924::common() and starter_script_ == iso_15924::inherited()) {
324 return default_script;
326 return starter_script_;
330 [[nodiscard]]
constexpr iso_15924 script() const noexcept
332 return iso_15924{std::in_place_t{}, narrow_cast<uint16_t>((_value >> 36) & 0x3ff)};
343 hi_axiom(new_script.intrinsic() < 1000);
345 constexpr auto mask = ~(value_type{0x3ff} << 36);
347 _value |= wide_cast<value_type>(new_script.intrinsic()) << 36;
357 [[nodiscard]]
constexpr iso_3166 region() const noexcept
359 return iso_3166{std::in_place_t{}, narrow_cast<uint16_t>((_value >> 46) & 0x3ff)};
362 constexpr void set_region(iso_3166 rhs)
noexcept
364 hi_axiom(rhs.intrinsic() < 1000);
366 constexpr auto mask = ~(value_type{0x3ff} << 46);
368 _value |= wide_cast<value_type>(rhs.intrinsic()) << 46;
371 [[nodiscard]]
constexpr hi::language_tag language_tag() const noexcept
375 auto const language_ = iso_639{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x7fff)};
377 auto const script_ = iso_15924{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x3ff)};
379 auto const region_ = iso_3166{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x3ff)};
380 return hi::language_tag{language_, script_, region_};
383 constexpr void set_language_tag(hi::language_tag rhs)
noexcept
385 hi_axiom(rhs.region.intrinsic() < 1000);
386 hi_axiom(rhs.language.intrinsic() <= 0x7fff);
388 auto const new_script = starter_script(rhs.script);
389 hi_axiom(new_script.intrinsic() < 1000);
391 auto tmp = wide_cast<value_type>(rhs.region.intrinsic());
393 tmp |= new_script.intrinsic();
395 tmp |= rhs.language.intrinsic();
398 constexpr auto mask = ~(uint64_t{0x7'ffff'ffff} << 21);
403 [[nodiscard]]
constexpr hi::phrasing
phrasing() const noexcept
405 return static_cast<hi::phrasing
>((_value >> 56) & 0x3f);
408 constexpr void set_phrasing(hi::phrasing rhs)
noexcept
410 hi_axiom(std::to_underlying(rhs) <= 0x3f);
412 constexpr auto mask = ~(value_type{0x3f} << 56);
414 _value |=
static_cast<value_type
>(rhs) << 56;
417 [[nodiscard]]
constexpr grapheme_attributes attributes() const noexcept
419 auto r = grapheme_attributes{};
423 r.language = iso_639{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x7fff)};
425 r.script = iso_15924{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x3ff)};
427 r.region = iso_3166{std::in_place_t{}, narrow_cast<uint16_t>(tmp & 0x3ff)};
429 r.phrasing =
static_cast<hi::phrasing
>(tmp & 0x3f);
437 if (
auto i =
index(); i <= 0x10'ffff) {
440 return detail::long_graphemes.get_grapheme_size(i - 0x11'0000);
444 [[nodiscard]]
constexpr char32_t starter() const noexcept
446 if (
auto i = index(); i <= 0x10'ffff) {
447 return char_cast<char32_t>(i);
449 return detail::long_graphemes.get_grapheme_starter(i - 0x11'0000);
453 [[nodiscard]]
constexpr bool is_ascii() const noexcept
455 return index() <= 127;
462 if (
auto const i =
index(); i <= 0x10'ffff) {
465 return detail::long_graphemes.get_grapheme(i - 0x11'0000);
481 [[nodiscard]]
friend constexpr bool operator==(grapheme
const& lhs, grapheme
const& rhs)
noexcept
483 return lhs.index() == rhs.index();
486 [[nodiscard]]
friend constexpr bool operator==(
grapheme const& lhs,
char32_t const& rhs)
noexcept
488 hi_axiom(char_cast<value_type>(rhs) <= 0x10'ffff);
489 return lhs.index() == char_cast<value_type>(rhs);
492 [[nodiscard]]
friend constexpr bool operator==(grapheme
const& lhs,
char const& rhs)
noexcept
494 hi_axiom(char_cast<value_type>(rhs) <= 0x7f);
495 return lhs.index() == char_cast<value_type>(rhs);
500 [[nodiscard]]
friend constexpr std::strong_ordering
operator<=>(grapheme
const& lhs, grapheme
const& rhs)
noexcept
502 return lhs.decomposed() <=> rhs.decomposed();
505 [[nodiscard]]
friend constexpr std::strong_ordering operator<=>(
grapheme const& lhs,
char32_t const& rhs)
noexcept
510 [[nodiscard]]
friend constexpr std::strong_ordering operator<=>(grapheme
const& lhs,
char const& rhs)
noexcept
515 [[nodiscard]]
friend constexpr std::string
to_string(grapheme
const& rhs)
noexcept
517 return hi::to_string(rhs.composed());
520 [[nodiscard]]
friend constexpr std::wstring
to_wstring(grapheme
const& rhs)
noexcept
525 [[nodiscard]]
friend constexpr std::u32string
to_u32string(grapheme
const& rhs)
noexcept
527 return rhs.composed();
constexpr std::u32string to_u32string(std::u32string_view rhs) noexcept
Identity conversion from UTF-32 to UTF-32.
Definition to_string.hpp:28
constexpr std::wstring to_wstring(std::u32string_view rhs) noexcept
Conversion from UTF-32 to wide-string (UTF-16/32).
Definition to_string.hpp:160
@ grapheme
The gui_event has grapheme data.
Definition gui_event_variant.hpp:40
phrasing
Phrasing.
Definition phrasing.hpp:31
The HikoGUI namespace.
Definition array_generic.hpp:21
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
constexpr void set_language(It first, ItEnd last, language_tag language) noexcept
Set the language for the string.
Definition gstring.hpp:187
constexpr std::u32string unicode_decompose(std::u32string_view text, unicode_normalize_config config=unicode_normalize_config::NFD()) noexcept
Convert text to a Unicode decomposed normal form.
Definition unicode_normalization.hpp:293
constexpr bool unicode_is_NFC_grapheme(It it, ItEnd last) noexcept
Check if the string of code-points is a single grapheme in NFC normal form.
Definition unicode_normalization.hpp:324
constexpr std::u32string unicode_normalize(std::u32string_view text, unicode_normalize_config config=unicode_normalize_config::NFC()) noexcept
Convert text to a Unicode composed normal form.
Definition unicode_normalization.hpp:308
ISO-3166 country code.
Definition iso_3166_intf.hpp:21
ISO-639 language code.
Definition iso_639.hpp:29
size_t get_grapheme_size(uint32_t start) const noexcept
Get the size of the grapheme.
Definition grapheme.hpp:79
char32_t get_grapheme_starter(uint32_t start) const noexcept
Get the starter (first) code-point of a grapheme.
Definition grapheme.hpp:92
std::u32string get_grapheme(uint32_t start) const noexcept
Get the grapheme from the table.
Definition grapheme.hpp:55
int32_t add_grapheme(CodePoints &&code_points) noexcept
Find or insert a grapheme in the table.
Definition grapheme.hpp:105
Definition grapheme.hpp:155
A grapheme-cluster, what a user thinks a character is.
Definition grapheme.hpp:168
constexpr grapheme(std::u32string_view code_points) noexcept
Encode a grapheme from a list of code-points.
Definition grapheme.hpp:277
friend constexpr bool operator==(grapheme const &lhs, grapheme const &rhs) noexcept
Compare equivalence of two graphemes.
Definition grapheme.hpp:481
constexpr void set_script() noexcept
Set the script of the grapheme to the starter script.
Definition grapheme.hpp:352
friend constexpr std::strong_ordering operator<=>(grapheme const &lhs, grapheme const &rhs) noexcept
Compare two graphemes lexicographically.
Definition grapheme.hpp:500
constexpr iso_15924 starter_script(iso_15924 default_script) const noexcept
Get the script of the starter code-point.
Definition grapheme.hpp:320
constexpr uint32_t index() const noexcept
Get the codepoint/index part of the grapheme.
Definition grapheme.hpp:284
constexpr grapheme(char32_t code_point) noexcept
Encode a single code-point.
Definition grapheme.hpp:205
constexpr grapheme & operator=(char ascii_char) noexcept
Encode a single ASCII character.
Definition grapheme.hpp:233
constexpr grapheme(composed_t, CodePoints &&code_points) noexcept
Encode a grapheme from a list of code-points.
Definition grapheme.hpp:248
value_type _value
The grapheme's value.
Definition grapheme.hpp:183
constexpr std::u32string decomposed(unicode_normalize_config config=unicode_normalize_config::NFD()) const noexcept
Get the list of code-points normalized to NFD.
Definition grapheme.hpp:472
constexpr iso_15924 starter_script() const noexcept
Get the script of the starter code-point.
Definition grapheme.hpp:308
constexpr std::u32string composed() const noexcept
Get the list of code-points normalized to NFC.
Definition grapheme.hpp:460
constexpr void set_script(iso_15924 rhs) noexcept
Set the script of the grapheme.
Definition grapheme.hpp:340
constexpr grapheme & operator=(char32_t code_point) noexcept
Encode a single code-point.
Definition grapheme.hpp:221
constexpr std::size_t size() const noexcept
Return the number of code-points encoded in the grapheme.
Definition grapheme.hpp:435
Definition unicode_normalization.hpp:24