HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
unicode_sentence_break.hpp
Go to the documentation of this file.
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
8#pragma once
9
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <tuple>
#include <utility>
#include <vector>
11
12namespace hi::inline v1 {
13
/** The Sentence_Break property values used by the UAX #29 sentence boundary rules.
 *
 * The enumerator order defines the underlying values (Other == 0 ... Close == 14).
 * Every value must stay below 0x40: `detail::unicode_sentence_break_info` keeps
 * the property in the low six bits of a byte and uses bit 6 as its skip flag.
 */
enum class unicode_sentence_break_property : uint8_t {
    Other, ///< No specific sentence break property; also the value of a default constructed info.
    CR, ///< Carriage return.
    LF, ///< Line feed.
    Extend, ///< Extending characters such as combining marks.
    Sep, ///< Paragraph/line separators other than CR and LF.
    Format, ///< Format control characters.
    Sp, ///< White space.
    Lower, ///< Lowercase letters.
    Upper, ///< Uppercase letters.
    OLetter, ///< Other letters.
    Numeric, ///< Digits.
    ATerm, ///< Ambiguous terminators such as the full stop.
    SContinue, ///< Characters that continue a sentence, such as a comma.
    STerm, ///< Unambiguous sentence terminators.
    Close ///< Closing punctuation such as quotes and brackets.
};
31
32namespace detail {
33
35public:
36 constexpr unicode_sentence_break_info() noexcept : _value(0)
37 {}
38 constexpr unicode_sentence_break_info(unicode_sentence_break_info const &) noexcept = default;
39 constexpr unicode_sentence_break_info(unicode_sentence_break_info &&) noexcept = default;
40 constexpr unicode_sentence_break_info &operator=(unicode_sentence_break_info const &) noexcept = default;
41 constexpr unicode_sentence_break_info &operator=(unicode_sentence_break_info &&) noexcept = default;
42
43 constexpr unicode_sentence_break_info(unicode_sentence_break_property const &sentence_break_property) noexcept : _value(to_underlying(sentence_break_property))
44 {}
45
46 constexpr unicode_sentence_break_info &make_skip() noexcept
47 {
48 _value |= 0x40;
49 return *this;
50 }
51
52 [[nodiscard]] constexpr bool is_skip() const noexcept
53 {
54 return to_bool(_value & 0x40);
55 }
56
57 [[nodiscard]] constexpr friend bool operator==(unicode_sentence_break_info const &lhs, unicode_sentence_break_property const &rhs) noexcept
58 {
59 return (lhs._value & 0x3f) == to_underlying(rhs);
60 }
61
62 [[nodiscard]] constexpr friend bool operator==(unicode_sentence_break_info const &, unicode_sentence_break_info const &) noexcept = default;
63
64 [[nodiscard]] constexpr friend bool is_ParaSep(unicode_sentence_break_info const &rhs) noexcept
65 {
66 return rhs == unicode_sentence_break_property::Sep or rhs == unicode_sentence_break_property::CR or rhs == unicode_sentence_break_property::LF;
67 }
68
69 [[nodiscard]] constexpr friend bool is_SATerm(unicode_sentence_break_info const &rhs) noexcept
70 {
71 return rhs == unicode_sentence_break_property::STerm or rhs == unicode_sentence_break_property::ATerm;
72 }
73
74private:
75 uint8_t _value;
76};
77
78[[nodiscard]] inline void unicode_sentence_break_SB1_SB4(
81{
82 using enum unicode_break_opportunity;
83 using enum unicode_sentence_break_property;
84
85 hi_axiom(r.size() == infos.size() + 1);
86
87 r.front() = yes; // SB1
88 r.back() = yes; // SB2
89
90 for (auto i = 1_uz; i < infos.size(); ++i) {
91 hilet prev = infos[i - 1];
92 hilet next = infos[i];
93
94 r[i] = [&] () {
95 if (prev == CR and next == LF) {
96 return no; // SB3
97 } else if (is_ParaSep(prev)) {
98 return yes; //SB4
99 } else {
100 return unassigned;
101 }
102 }();
103 }
104}
105
106[[nodiscard]] inline void unicode_sentence_break_SB5(
107 unicode_break_vector &r,
109{
110 using enum unicode_break_opportunity;
111 using enum unicode_sentence_break_property;
112
113 hi_axiom(r.size() == infos.size() + 1);
114
115 for (auto i = 1_uz; i < infos.size(); ++i) {
116 hilet prev = infos[i - 1];
117 auto &next = infos[i];
118
119 if ((not is_ParaSep(prev) and prev != CR and prev != LF) and (next == Extend or next == Format)) {
120 if (r[i] == unassigned) {
121 r[i] = no;
122 }
123 next.make_skip();
124 }
125 }
126}
127
128[[nodiscard]] inline void unicode_sentence_break_SB6_SB998(
129 unicode_break_vector &r,
131{
132 using enum unicode_break_opportunity;
133 using enum unicode_sentence_break_property;
134
135 hi_axiom(r.size() == infos.size() + 1);
136
137 for (auto i = 0_z; i < std::ssize(infos); ++i) {
138 hilet &next = infos[i];
139 if (r[i] != unassigned) {
140 continue;
141 }
142
143 hi_axiom(not next.is_skip());
144
146
147 hilet prev = [&] {
148 for (k = i - 1; k >= 0; --k) {
149 if (not infos[k].is_skip()) {
150 return infos[k];
151 }
152 }
153 return unicode_sentence_break_info{};
154 }();
155
156 hilet prev_prev = [&] {
157 for (--k; k >= 0; --k) {
158 if (not infos[k].is_skip()) {
159 return infos[k];
160 }
161 }
162 return unicode_sentence_break_info{};
163 }();
164
165 // close_sp
166 // 0 - no suffix
167 // 1 - ends in ParSep
168 // 2 - includes SP
169 // 4 - includes Close
170 hilet [prefix, close_sp_par_found] = [&]() {
171 using enum unicode_break_opportunity;
172
173 auto found = 0;
174 auto state = ' ';
175 for (auto j = i - 1; j >= 0; --j) {
176 if (not infos[j].is_skip()) {
177 switch (state) {
178 case ' ':
179 if (is_ParaSep(infos[j])) {
180 found |= 1;
181 state = 'p';
182 } else if (infos[j] == Sp) {
183 found |= 2;
184 state = 's';
185 } else if (infos[j] == Close) {
186 found |= 4;
187 state = 'c';
188 } else {
189 return std::make_pair(infos[j], found);
190 }
191 break;
192 case 'p': // We can only be in the state 'p' once.
193 case 's':
194 if (infos[j] == Sp) {
195 found |= 2;
196 state = 's';
197 } else if (infos[j] == Close) {
198 found |= 4;
199 state = 'c';
200 } else {
201 return std::make_pair(infos[j], found);
202 }
203 break;
204 case 'c':
205 if (infos[j] == Close) {
206 found |= 4;
207 state = 'c';
208 } else {
209 return std::make_pair(infos[j], found);
210 }
211 break;
212 }
213 }
214 }
215 return std::make_pair(unicode_sentence_break_info{}, 0);
216 }();
217 hilet optional_close = (close_sp_par_found & 3) == 0;
218 hilet optional_close_sp = (close_sp_par_found & 1) == 0;
219 hilet optional_close_sp_par = true;
220
221 hilet end_in_lower = [&]{
222 for (auto j = i; j < std::ssize(infos); ++j) {
223 if (not infos[j].is_skip()) {
224 if (infos[j] == Lower) {
225 return true;
226 } else if (infos[j] == OLetter or infos[j] == Upper or is_ParaSep(infos[j]) or is_SATerm(infos[j])) {
227 return false;
228 }
229 }
230 }
231 return false;
232 }();
233
234 r[i] = [&] () {
235 if (prev == ATerm and next == Numeric) {
236 return no; // SB6
237 } else if ((prev_prev == Upper or prev_prev == Lower) and prev == ATerm and next == Upper) {
238 return no; // SB7
239 } else if (prefix == ATerm and optional_close_sp and end_in_lower) {
240 return no; // SB8
241 } else if (is_SATerm(prefix) and optional_close_sp and (next == SContinue or is_SATerm(next))) {
242 return no; // SB8a
243 } else if (is_SATerm(prefix) and optional_close and (next == Close or next == Sp or is_ParaSep(next))) {
244 return no; // SB9
245 } else if (is_SATerm(prefix) and optional_close_sp and (next == Sp or is_ParaSep(next))) {
246 return no; // SB10
247 } else if (is_SATerm(prefix) and optional_close_sp_par) {
248 return yes; // SB11
249 } else {
250 return no; // SB998
251 }
252 }();
253 }
254}
255
256}
257
265template<typename It, typename ItEnd, typename DescriptionFunc>
266[[nodiscard]] inline unicode_break_vector unicode_sentence_break(It first, ItEnd last, DescriptionFunc const &description_func)
267{
268 auto size = narrow<size_t>(std::distance(first, last));
269 auto r = unicode_break_vector{size + 1, unicode_break_opportunity::unassigned};
270
272 infos.reserve(size);
273 std::transform(first, last, std::back_inserter(infos), [&] (hilet &item) {
274 hilet &description = description_func(item);
275 return detail::unicode_sentence_break_info{description.sentence_break_property()};
276 });
277
278 detail::unicode_sentence_break_SB1_SB4(r, infos);
279 detail::unicode_sentence_break_SB5(r, infos);
280 detail::unicode_sentence_break_SB6_SB998(r, infos);
281 return r;
282}
283
284
285}
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
DOXYGEN BUG.
Definition algorithm.hpp:15
unicode_break_vector unicode_sentence_break(It first, ItEnd last, DescriptionFunc const &description_func)
The unicode sentence break algorithm UAX#29.
Definition unicode_sentence_break.hpp:266
Definition unicode_sentence_break.hpp:34
T back_inserter(T... args)
T distance(T... args)
T make_pair(T... args)
T next(T... args)
T prev(T... args)
T reserve(T... args)
T transform(T... args)