HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
unicode_sentence_break.hpp
Go to the documentation of this file.
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
8#pragma once
9
10#include "ucd_sentence_break_properties.hpp"
11#include <tuple>
12
13namespace hi::inline v1 {
14
15namespace detail {
16
18public:
// Default constructor: stores 0 in the packed value, so the skip flag is clear.
// NOTE(review): which sentence-break property has underlying value 0 is not
// visible here - confirm against unicode_sentence_break_property.
19 constexpr unicode_sentence_break_info() noexcept : _value(0)
20 {}
// Copy and move construction/assignment are the compiler-generated defaults.
21 constexpr unicode_sentence_break_info(unicode_sentence_break_info const &) noexcept = default;
22 constexpr unicode_sentence_break_info(unicode_sentence_break_info &&) noexcept = default;
23 constexpr unicode_sentence_break_info &operator=(unicode_sentence_break_info const &) noexcept = default;
24 constexpr unicode_sentence_break_info &operator=(unicode_sentence_break_info &&) noexcept = default;
25
// Construct from a sentence-break property: stores the underlying value of the
// enumerator in the packed value. The skip flag is not set by this constructor.
// NOTE(review): this assumes every enumerator fits in the low 6 bits (the 0x3f
// mask used by the comparison operator) - confirm against the enum definition.
// NOTE(review): the constructor is not explicit, so a property converts
// implicitly to an info object.
26 constexpr unicode_sentence_break_info(unicode_sentence_break_property const &sentence_break_property) noexcept : _value(to_underlying(sentence_break_property))
27 {}
28
// Set the skip flag (bit 0x40) on this entry, leaving the property bits intact.
// Returns a reference to this object so that calls can be chained.
29 constexpr unicode_sentence_break_info &make_skip() noexcept
30 {
31 _value |= 0x40;
32 return *this;
33 }
34
// Returns true when the skip flag (bit 0x40) has been set by make_skip().
35 [[nodiscard]] constexpr bool is_skip() const noexcept
36 {
37 return to_bool(_value & 0x40);
38 }
39
// Compare the stored property against a property enumerator.
// Only the low 6 bits (mask 0x3f) take part, so the skip flag is ignored.
40 [[nodiscard]] constexpr friend bool operator==(unicode_sentence_break_info const &lhs, unicode_sentence_break_property const &rhs) noexcept
41 {
42 return (lhs._value & 0x3f) == to_underlying(rhs);
43 }
44
// Defaulted comparison between two info objects.
// Unlike the overload above, this compares the whole packed value, skip flag included.
45 [[nodiscard]] constexpr friend bool operator==(unicode_sentence_break_info const &, unicode_sentence_break_info const &) noexcept = default;
46
// Returns true when the property of rhs is Sep, CR or LF.
// The skip flag is ignored because the property comparison masks it out.
47 [[nodiscard]] constexpr friend bool is_ParaSep(unicode_sentence_break_info const &rhs) noexcept
48 {
49 return rhs == unicode_sentence_break_property::Sep or rhs == unicode_sentence_break_property::CR or rhs == unicode_sentence_break_property::LF;
50 }
51
// Returns true when the property of rhs is STerm or ATerm.
// The skip flag is ignored because the property comparison masks it out.
52 [[nodiscard]] constexpr friend bool is_SATerm(unicode_sentence_break_info const &rhs) noexcept
53 {
54 return rhs == unicode_sentence_break_property::STerm or rhs == unicode_sentence_break_property::ATerm;
55 }
56
57private:
// Packed state, as used by the members of this class:
//  - bits 0-5 (mask 0x3f): underlying value of the sentence-break property.
//  - bit 6 (0x40): skip flag, set by make_skip() and read by is_skip().
58 uint8_t _value;
59};
60
// Apply sentence-break rules SB1 through SB4 to the break vector r.
//
// The body reads infos (one entry per code point) and writes r, where r[i] is
// the break opportunity between infos[i - 1] and infos[i]. r must hold exactly
// one more entry than infos.
//
// NOTE(review): the parameter list is missing from this listing; the names r
// and infos are taken from the body - confirm their types against the header.
// NOTE(review): [[nodiscard]] has no effect on a function returning void and
// some compilers warn about it - consider removing the attribute.
// NOTE(review): r.front() and r.back() are called unconditionally; this relies
// on r being non-empty, which the axiom below implies.
61[[nodiscard]] inline void unicode_sentence_break_SB1_SB4(
64{
65 using enum unicode_break_opportunity;
66 using enum unicode_sentence_break_property;
67
68 hi_axiom(r.size() == infos.size() + 1);
69
70 r.front() = yes; // SB1
71 r.back() = yes; // SB2
72
// Visit every interior position; index 0 and the last index were set above.
73 for (auto i = 1_uz; i < infos.size(); ++i) {
74 hilet prev = infos[i - 1];
75 hilet next = infos[i];
76
// Every interior position is written here: positions not decided by SB3 or
// SB4 are set to unassigned so that later passes can decide them.
77 r[i] = [&] () {
78 if (prev == CR and next == LF) {
79 return no; // SB3
80 } else if (is_ParaSep(prev)) {
81 return yes; //SB4
82 } else {
83 return unassigned;
84 }
85 }();
86 }
87}
88
// Apply sentence-break rule SB5 to the break vector r.
//
// For each Extend or Format entry that follows an entry which is not a
// paragraph separator, the position before it becomes a no-break (unless it
// was already decided) and the entry is flagged as skip, so that the later
// pass ignores it when looking at neighbouring entries.
//
// infos is modified: make_skip() is called on the matching entries.
//
// NOTE(review): one parameter line is missing from this listing; the name
// infos is taken from the body - confirm its type against the header.
// NOTE(review): [[nodiscard]] has no effect on a function returning void -
// consider removing the attribute.
89[[nodiscard]] inline void unicode_sentence_break_SB5(
90 unicode_break_vector &r,
92{
93 using enum unicode_break_opportunity;
94 using enum unicode_sentence_break_property;
95
96 hi_axiom(r.size() == infos.size() + 1);
97
98 for (auto i = 1_uz; i < infos.size(); ++i) {
99 hilet prev = infos[i - 1];
// next is taken by reference because it may be flagged as skip below.
100 auto &next = infos[i];
101
// NOTE(review): is_ParaSep() already covers CR and LF, so the two explicit
// checks on prev look redundant.
102 if ((not is_ParaSep(prev) and prev != CR and prev != LF) and (next == Extend or next == Format)) {
// Do not overwrite a decision made by an earlier rule.
103 if (r[i] == unassigned) {
104 r[i] = no;
105 }
106 next.make_skip();
107 }
108 }
109}
110
// Apply sentence-break rules SB6 through SB998 to the break vector r.
//
// Only positions that are still unassigned are decided here; every such
// position ends up as either yes or no. Entries of infos that are flagged as
// skip are ignored when scanning backward or forward for context.
//
// NOTE(review): one parameter line is missing from this listing; the name
// infos is taken from the body - confirm its type against the header.
// NOTE(review): [[nodiscard]] has no effect on a function returning void -
// consider removing the attribute.
111[[nodiscard]] inline void unicode_sentence_break_SB6_SB998(
112 unicode_break_vector &r,
114{
115 using enum unicode_break_opportunity;
116 using enum unicode_sentence_break_property;
117
118 hi_axiom(r.size() == infos.size() + 1);
119
// A signed index is used because the backward scans below count down past zero.
120 for (auto i = 0_z; i < std::ssize(infos); ++i) {
121 hilet &next = infos[i];
// Positions already decided by an earlier pass are left untouched.
122 if (r[i] != unassigned) {
123 continue;
124 }
125
126 hi_axiom(not next.is_skip());
127
// NOTE(review): the declaration of k is on a line missing from this listing;
// from its use below it is presumably a signed index - confirm.
129
// prev: the nearest non-skip entry before position i, or a default-constructed
// info when there is none. k is left at the index where the scan stopped.
130 hilet prev = [&] {
131 for (k = i - 1; k >= 0; --k) {
132 if (not infos[k].is_skip()) {
133 return infos[k];
134 }
135 }
136 return unicode_sentence_break_info{};
137 }();
138
// prev_prev: continues the same scan from just before prev, so this lambda
// must run after the one above because both share k.
139 hilet prev_prev = [&] {
140 for (--k; k >= 0; --k) {
141 if (not infos[k].is_skip()) {
142 return infos[k];
143 }
144 }
145 return unicode_sentence_break_info{};
146 }();
147
148 // close_sp_par_found: bit flags describing the run of entries between prefix and position i.
149 // 0 - no suffix
150 // 1 - ends in ParaSep
151 // 2 - includes Sp
152 // 4 - includes Close
// prefix is the first non-skip entry found, scanning backward, that is not part
// of that run. The run is matched backward as: optional ParaSep, then Sp
// entries, then Close entries. When the start of the text is reached first,
// a default-constructed info and 0 flags are returned.
153 hilet [prefix, close_sp_par_found] = [&]() {
// NOTE(review): this using-enum repeats the one at function scope.
154 using enum unicode_break_opportunity;
155
156 auto found = 0;
// state is a char tag: space = nothing matched yet, p = ParaSep matched,
// s = inside the Sp run, c = inside the Close run.
157 auto state = ' ';
158 for (auto j = i - 1; j >= 0; --j) {
159 if (not infos[j].is_skip()) {
160 switch (state) {
161 case ' ':
162 if (is_ParaSep(infos[j])) {
163 found |= 1;
164 state = 'p';
165 } else if (infos[j] == Sp) {
166 found |= 2;
167 state = 's';
168 } else if (infos[j] == Close) {
169 found |= 4;
170 state = 'c';
171 } else {
172 return std::make_pair(infos[j], found);
173 }
174 break;
175 case 'p': // We can only be in the state 'p' once.
176 case 's':
177 if (infos[j] == Sp) {
178 found |= 2;
179 state = 's';
180 } else if (infos[j] == Close) {
181 found |= 4;
182 state = 'c';
183 } else {
184 return std::make_pair(infos[j], found);
185 }
186 break;
187 case 'c':
188 if (infos[j] == Close) {
189 found |= 4;
190 state = 'c';
191 } else {
192 return std::make_pair(infos[j], found);
193 }
194 break;
195 }
196 }
197 }
198 return std::make_pair(unicode_sentence_break_info{}, 0);
199 }();
// optional_close: the run contains neither Sp nor ParaSep (only Close, or nothing).
200 hilet optional_close = (close_sp_par_found & 3) == 0;
// optional_close_sp: the run contains no ParaSep (only Close and Sp, or nothing).
201 hilet optional_close_sp = (close_sp_par_found & 1) == 0;
// Every run matched above is acceptable here, so this is always true.
// NOTE(review): the constant makes the SB11 test depend on prefix alone.
202 hilet optional_close_sp_par = true;
203
// end_in_lower: scanning forward from position i (inclusive) over non-skip
// entries, true when a Lower entry is reached before any OLetter, Upper,
// ParaSep, STerm or ATerm entry; false when one of those comes first or the
// end of infos is reached.
204 hilet end_in_lower = [&]{
205 for (auto j = i; j < std::ssize(infos); ++j) {
206 if (not infos[j].is_skip()) {
207 if (infos[j] == Lower) {
208 return true;
209 } else if (infos[j] == OLetter or infos[j] == Upper or is_ParaSep(infos[j]) or is_SATerm(infos[j])) {
210 return false;
211 }
212 }
213 }
214 return false;
215 }();
216
// The rules are tried in order; the first matching rule decides the position.
217 r[i] = [&] () {
218 if (prev == ATerm and next == Numeric) {
219 return no; // SB6
220 } else if ((prev_prev == Upper or prev_prev == Lower) and prev == ATerm and next == Upper) {
221 return no; // SB7
222 } else if (prefix == ATerm and optional_close_sp and end_in_lower) {
223 return no; // SB8
224 } else if (is_SATerm(prefix) and optional_close_sp and (next == SContinue or is_SATerm(next))) {
225 return no; // SB8a
226 } else if (is_SATerm(prefix) and optional_close and (next == Close or next == Sp or is_ParaSep(next))) {
227 return no; // SB9
228 } else if (is_SATerm(prefix) and optional_close_sp and (next == Sp or is_ParaSep(next))) {
229 return no; // SB10
230 } else if (is_SATerm(prefix) and optional_close_sp_par) {
231 return yes; // SB11
232 } else {
233 return no; // SB998
234 }
235 }();
236 }
237}
238
239}
240
// Compute the sentence-break opportunities for a range of items.
//
// first, last: the range of items to analyse.
// code_point_func: called with each item; its result is passed to
//   ucd_get_sentence_break_property() to look up the sentence-break property.
// Returns a break vector with one more entry than the number of items, filled
// in by the SB1-SB4, SB5 and SB6-SB998 passes, in that order.
//
// NOTE(review): the declaration of infos is on a line missing from this
// listing; it is presumably a vector of detail::unicode_sentence_break_info -
// confirm against the header.
248template<typename It, typename ItEnd, typename CodePointFunc>
249[[nodiscard]] inline unicode_break_vector
250unicode_sentence_break(It first, ItEnd last, CodePointFunc const& code_point_func) noexcept
251{
252 auto size = narrow_cast<size_t>(std::distance(first, last));
// One entry per boundary: before the first item, between items, after the last.
253 auto r = unicode_break_vector{size + 1, unicode_break_opportunity::unassigned};
254
256 infos.reserve(size);
// Translate every item into its sentence-break property.
257 std::transform(first, last, std::back_inserter(infos), [&] (hilet &item) {
258 hilet code_point = code_point_func(item);
259 return detail::unicode_sentence_break_info{ucd_get_sentence_break_property(code_point)};
260 });
261
// The passes depend on each other: SB5 flags skip entries that SB6-SB998 reads,
// and the later passes only decide positions left unassigned by earlier ones.
262 detail::unicode_sentence_break_SB1_SB4(r, infos);
263 detail::unicode_sentence_break_SB5(r, infos);
264 detail::unicode_sentence_break_SB6_SB998(r, infos);
265 return r;
266}
267
268
269}
#define hi_axiom(expression,...)
Specify an axiom; an expression that is true.
Definition assert.hpp:253
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
DOXYGEN BUG.
Definition algorithm.hpp:13
unicode_break_vector unicode_sentence_break(It first, ItEnd last, CodePointFunc const &code_point_func) noexcept
The unicode sentence break algorithm UAX#29.
Definition unicode_sentence_break.hpp:250
Definition unicode_sentence_break.hpp:17
T back_inserter(T... args)
T distance(T... args)
T make_pair(T... args)
T next(T... args)
T prev(T... args)
T reserve(T... args)
T transform(T... args)