HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
unicode_grapheme_cluster_break.hpp
1// Copyright Take Vos 2020.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "ucd_grapheme_cluster_breaks.hpp"
8#include "unicode_break_opportunity.hpp"
9#include "../macros.hpp"
10#include <cstdint>
11
12namespace hi { inline namespace v1 {
13namespace detail {
14
16 unicode_grapheme_cluster_break previous = unicode_grapheme_cluster_break::Other;
17 int RI_count = 0;
18 bool first_character = true;
19 bool in_extended_pictograph = false;
20
21 constexpr void reset() noexcept
22 {
23 previous = unicode_grapheme_cluster_break::Other;
24 RI_count = 0;
25 first_character = true;
26 in_extended_pictograph = false;
27 }
28};
29
38[[nodiscard]] constexpr bool breaks_grapheme(unicode_grapheme_cluster_break cluster_break, grapheme_break_state& state) noexcept
39{
40 using enum unicode_grapheme_cluster_break;
41
42 hilet lhs = state.previous;
43 hilet rhs = cluster_break;
44
45 enum class break_state {
46 unknown,
49 };
50
51 break_state break_state = break_state::unknown;
52
53 // GB1, GB2: Break at the start and end of text, unless the text is empty.
54 bool GB1 = state.first_character;
55 if ((break_state == break_state::unknown) & GB1) {
56 break_state = break_state::do_break;
57 }
58
59 state.first_character = false;
60
61 // GB3, GB4, GB5: Do not break between a CR and LF. Otherwise, break before and after controls.
62 hilet GB3 = (lhs == CR) && (rhs == LF);
63 hilet GB4 = (lhs == Control) || (lhs == CR) || (lhs == LF);
64 hilet GB5 = (rhs == Control) || (rhs == CR) || (rhs == LF);
65 if (break_state == break_state::unknown) {
66 if (GB3) {
67 break_state = break_state::dont_break;
68 } else if (GB4 || GB5) {
69 break_state = break_state::do_break;
70 }
71 }
72
73 // GB6, GB7, GB8: Do not break Hangul syllable sequences.
74 hilet GB6 = (lhs == L) && ((rhs == L) || (rhs == V) || (rhs == LV) | (rhs == LVT));
75 hilet GB7 = ((lhs == LV) || (lhs == V)) && ((rhs == V) || (rhs == T));
76 hilet GB8 = ((lhs == LVT) || (lhs == T)) && (rhs == T);
77 if ((break_state == break_state::unknown) && (GB6 || GB7 || GB8)) {
78 break_state = break_state::dont_break;
79 }
80
81 // GB9: Do not break before extending characters or ZWJ.
82 hilet GB9 = ((rhs == Extend) || (rhs == ZWJ));
83
84 // GB9a, GB9b: Do not break before SpacingMarks, or after Prepend characters.
85 // Both rules only apply to extended grapheme clusters.
86 hilet GB9a = (rhs == SpacingMark);
87 hilet GB9b = (lhs == Prepend);
88 if ((break_state == break_state::unknown) & (GB9 || GB9a || GB9b)) {
89 break_state = break_state::dont_break;
90 }
91
92 // GB11: Do not break within emoji modifier sequences or emoji zwj sequences.
93 hilet GB11 = state.in_extended_pictograph && (lhs == ZWJ) && (rhs == Extended_Pictographic);
94 if ((break_state == break_state::unknown) && GB11) {
95 break_state = break_state::dont_break;
96 }
97
98 if (rhs == Extended_Pictographic) {
99 state.in_extended_pictograph = true;
100 } else if (!((rhs == Extend) || (rhs == ZWJ))) {
101 state.in_extended_pictograph = false;
102 }
103
104 // GB12, GB13: Do not break within emoji flag sequences.
105 // That is, do not break between regional indicator (RI) symbols,
106 // if there is an odd number of RI characters before the break point.
107 hilet GB12_13 = (lhs == Regional_Indicator) && (rhs == Regional_Indicator) && ((state.RI_count % 2) == 1);
108 if ((break_state == break_state::unknown) && (GB12_13)) {
109 break_state = break_state::dont_break;
110 }
111
112 if (rhs == Regional_Indicator) {
113 state.RI_count++;
114 } else {
115 state.RI_count = 0;
116 }
117
118 // GB999: Otherwise, break everywhere.
119 if (break_state == break_state::unknown) {
120 break_state = break_state::do_break;
121 }
122
123 state.previous = rhs;
124 return break_state == break_state::do_break;
125}
126
134[[nodiscard]] constexpr bool breaks_grapheme(char32_t code_point, grapheme_break_state& state) noexcept
135{
136 return breaks_grapheme(ucd_get_grapheme_cluster_break(code_point), state);
137}
138
139} // namespace detail
140
141template<typename It, typename ItEnd>
142[[nodiscard]] constexpr std::vector<unicode_break_opportunity> unicode_grapheme_break(It first, ItEnd last) noexcept
143{
145 auto state = detail::grapheme_break_state{};
146
147 for (auto it = first; it != last; ++it) {
148 hilet opportunity = detail::breaks_grapheme(*it, state) ? unicode_break_opportunity::yes : unicode_break_opportunity::no;
149 r.push_back(opportunity);
150 }
151
152 r.push_back(unicode_break_opportunity::yes);
153 return r;
154}
155
156}} // namespace hi::v1
DOXYGEN BUG.
Definition algorithm.hpp:16
geometry/margins.hpp
Definition lookahead_iterator.hpp:5
constexpr Out narrow_cast(In const &rhs) noexcept
Cast numeric values without loss of precision.
Definition cast.hpp:377
Definition unicode_grapheme_cluster_break.hpp:15