HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
unicode_sentence_break.hpp
Go to the documentation of this file.
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
8#pragma once
9
10#include "ucd_sentence_break_properties.hpp"
11#include "unicode_break_opportunity.hpp"
12#include "../utility/utility.hpp"
13#include "../macros.hpp"
14#include <tuple>
15#include <vector>
16#include <iterator>
17#include <algorithm>
18
19hi_export_module(hikogui.unicode.unicode_sentence_break);
20
21hi_export namespace hi::inline v1 {
22
23namespace detail {
24
26public:
// Default-construct with a raw value of zero; the skip flag (0x40) is therefore clear.
27 constexpr unicode_sentence_break_info() noexcept : _value(0)
28 {}
// Copy/move construction and assignment are the compiler-generated ones.
29 constexpr unicode_sentence_break_info(unicode_sentence_break_info const &) noexcept = default;
30 constexpr unicode_sentence_break_info(unicode_sentence_break_info &&) noexcept = default;
31 constexpr unicode_sentence_break_info &operator=(unicode_sentence_break_info const &) noexcept = default;
32 constexpr unicode_sentence_break_info &operator=(unicode_sentence_break_info &&) noexcept = default;
33
// Construct from a sentence-break property by storing its underlying integer value.
// This constructor is not explicit, so a property converts implicitly to an info.
// NOTE(review): is_skip() uses bit 0x40 and the property comparison masks with 0x3f,
// so this presumably relies on every property value fitting in the low six bits -
// confirm against the enum definition.
34 constexpr unicode_sentence_break_info(unicode_sentence_break_property const &sentence_break_property) noexcept : _value(std::to_underlying(sentence_break_property))
35 {}
36
// Set the skip flag (bit 0x40) on this info and return *this so the call can be chained.
// Only that one bit is OR-ed in; the remaining bits of the raw value are left untouched.
37 constexpr unicode_sentence_break_info &make_skip() noexcept
38 {
39 _value |= 0x40;
40 return *this;
41 }
42
// Return true when the skip flag (bit 0x40) has been set through make_skip().
43 [[nodiscard]] constexpr bool is_skip() const noexcept
44 {
45 return to_bool(_value & 0x40);
46 }
47
// Compare the stored property against `rhs` while ignoring the skip flag: only the
// low six bits (mask 0x3f) of the raw value take part in the comparison.
48 [[nodiscard]] constexpr friend bool operator==(unicode_sentence_break_info const &lhs, unicode_sentence_break_property const &rhs) noexcept
49 {
50 return (lhs._value & 0x3f) == std::to_underlying(rhs);
51 }
52
// Defaulted member-wise equality between two infos. Unlike the overload above this
// compares the whole raw value, so the skip flag (0x40) is part of the comparison.
53 [[nodiscard]] constexpr friend bool operator==(unicode_sentence_break_info const &, unicode_sentence_break_info const &) noexcept = default;
54
// Return true when the property of `rhs` is Sep, CR or LF.
// The skip flag is ignored because the property comparison masks it out.
55 [[nodiscard]] constexpr friend bool is_ParaSep(unicode_sentence_break_info const &rhs) noexcept
56 {
57 return rhs == unicode_sentence_break_property::Sep or rhs == unicode_sentence_break_property::CR or rhs == unicode_sentence_break_property::LF;
58 }
59
// Return true when the property of `rhs` is STerm or ATerm.
// The skip flag is ignored because the property comparison masks it out.
60 [[nodiscard]] constexpr friend bool is_SATerm(unicode_sentence_break_info const &rhs) noexcept
61 {
62 return rhs == unicode_sentence_break_property::STerm or rhs == unicode_sentence_break_property::ATerm;
63 }
64
65private:
66 uint8_t _value;
67};
68
// First pass: seed the break-opportunity vector with the rules marked SB1-SB4 below.
//
// `r` must hold exactly one entry more than `infos` (checked with hi_axiom). Entry i
// describes the boundary between infos[i - 1] and infos[i]; the first and last entries
// are the boundaries at the start and end of the text. Every interior entry is
// overwritten here, and `unassigned` is written where none of these rules applies so
// that a later pass can decide.
// NOTE(review): the parameter list (original lines 70-71) is missing from this listing,
// so the exact types and const-ness of `r` and `infos` cannot be confirmed from here.
69inline void unicode_sentence_break_SB1_SB4(
72{
73 using enum unicode_break_opportunity;
74 using enum unicode_sentence_break_property;
75
76 hi_axiom(r.size() == infos.size() + 1);
77
// With an empty `infos` the vector has a single entry, so front() and back() alias.
78 r.front() = yes; // SB1
79 r.back() = yes; // SB2
80
// Interior boundaries only: index 0 and index infos.size() were handled above.
81 for (auto i = 1_uz; i < infos.size(); ++i) {
82 auto const prev = infos[i - 1];
83 auto const next = infos[i];
84
85 r[i] = [&] () {
86 if (prev == CR and next == LF) {
87 return no; // SB3
88 } else if (is_ParaSep(prev)) {
89 return yes; // SB4
90 } else {
91 return unassigned;
92 }
93 }();
94 }
95}
96
// Second pass (SB5): mark Extend and Format elements that do not follow a paragraph
// separator as "skip", and forbid a break in front of them unless an earlier rule
// already decided that boundary.
//
// `r` must hold exactly one entry more than `infos` (checked with hi_axiom). Elements
// of `infos` are modified in place through make_skip().
// NOTE(review): the declaration of `infos` (original line 99) is missing from this
// listing; it must be a mutable reference for make_skip() to have an effect - confirm.
97inline void unicode_sentence_break_SB5(
98 unicode_break_vector &r,
100{
101 using enum unicode_break_opportunity;
102 using enum unicode_sentence_break_property;
103
104 hi_axiom(r.size() == infos.size() + 1);
105
106 for (auto i = 1_uz; i < infos.size(); ++i) {
// `prev` is a copy and may already carry the skip flag from the previous iteration;
// the property comparisons below ignore that flag.
107 auto const prev = infos[i - 1];
108 auto &next = infos[i];
109
// The explicit CR/LF tests repeat what is_ParaSep() already checks (Sep, CR, LF).
110 if ((not is_ParaSep(prev) and prev != CR and prev != LF) and (next == Extend or next == Format)) {
// Keep a decision made by an earlier pass; only fill in boundaries still open.
111 if (r[i] == unassigned) {
112 r[i] = no;
113 }
114 next.make_skip();
115 }
116 }
117}
118
// Final pass: resolve every boundary that is still `unassigned` using the rules
// marked SB6 through SB998 below. Elements flagged as skip are ignored whenever the
// context before or after a boundary is examined.
//
// `r` must hold exactly one entry more than `infos` (checked with hi_axiom). Only
// entries that are `unassigned` on entry are written.
// NOTE(review): the declaration of `infos` (original line 121) and the declaration of
// the index `k` used below (original line 136) are missing from this listing.
119inline void unicode_sentence_break_SB6_SB998(
120 unicode_break_vector &r,
122{
123 using enum unicode_break_opportunity;
124 using enum unicode_sentence_break_property;
125
126 hi_axiom(r.size() == infos.size() + 1);
127
// Signed index: the backward scans below count down to -1.
128 for (auto i = 0_z; i < std::ssize(infos); ++i) {
129 auto const &next = infos[i];
130 if (r[i] != unassigned) {
131 continue;
132 }
133
// The SB5 pass resolves r[i] for every element it flags as skip, so an element that
// is still unassigned here is expected not to be a skip element.
134 hi_axiom(not next.is_skip());
135
137
// Nearest non-skipped element before the boundary, or a default-constructed info when
// there is none. `k` is left at the index that was found (or -1) for the next lambda.
138 auto const prev = [&] {
139 for (k = i - 1; k >= 0; --k) {
140 if (not infos[k].is_skip()) {
141 return infos[k];
142 }
143 }
144 return unicode_sentence_break_info{};
145 }();
146
// Continue the same backward scan from just before `prev` to find the non-skipped
// element preceding it; again a default-constructed info when there is none.
147 auto const prev_prev = [&] {
148 for (--k; k >= 0; --k) {
149 if (not infos[k].is_skip()) {
150 return infos[k];
151 }
152 }
153 return unicode_sentence_break_info{};
154 }();
155
156 // close_sp_par_found: bit set describing the run between `prefix` and the boundary
157 // 0 - no suffix
158 // 1 - ends in ParaSep
159 // 2 - includes SP
160 // 4 - includes Close
// Scan backward over non-skipped elements, consuming (in text order) Close* Sp* and at
// most one ParaSep directly before the boundary. `prefix` is the first element that
// does not fit that pattern; when the scan runs off the start of the text the result
// is a default-constructed info with no flags set.
161 auto const [prefix, close_sp_par_found] = [&]() {
162 using enum unicode_break_opportunity;
163
164 auto found = 0;
// States: ' ' nothing consumed yet, 'p' consumed the ParaSep, 's' inside the Sp run,
// 'c' inside the Close run.
165 auto state = ' ';
166 for (auto j = i - 1; j >= 0; --j) {
167 if (not infos[j].is_skip()) {
168 switch (state) {
169 case ' ':
170 if (is_ParaSep(infos[j])) {
171 found |= 1;
172 state = 'p';
173 } else if (infos[j] == Sp) {
174 found |= 2;
175 state = 's';
176 } else if (infos[j] == Close) {
177 found |= 4;
178 state = 'c';
179 } else {
180 return std::make_pair(infos[j], found);
181 }
182 break;
183 case 'p': // We can only be in the state 'p' once.
184 case 's':
185 if (infos[j] == Sp) {
186 found |= 2;
187 state = 's';
188 } else if (infos[j] == Close) {
189 found |= 4;
190 state = 'c';
191 } else {
192 return std::make_pair(infos[j], found);
193 }
194 break;
195 case 'c':
196 if (infos[j] == Close) {
197 found |= 4;
198 state = 'c';
199 } else {
200 return std::make_pair(infos[j], found);
201 }
202 break;
203 }
204 }
205 }
206 return std::make_pair(unicode_sentence_break_info{}, 0);
207 }();
// True when the consumed run contains neither Sp nor ParaSep, i.e. it matches Close*.
208 auto const optional_close = (close_sp_par_found & 3) == 0;
// True when the consumed run contains no ParaSep, i.e. it matches Close* Sp*.
209 auto const optional_close_sp = (close_sp_par_found & 1) == 0;
// Always true: the scan above only ever consumes Close* Sp* ParaSep?.
210 auto const optional_close_sp_par = true;
211
// Look forward from the boundary over non-skipped elements: true when a Lower is
// reached before any OLetter, Upper, ParaSep, STerm or ATerm; false otherwise,
// including when the end of the text is reached first.
212 auto const end_in_lower = [&]{
213 for (auto j = i; j < std::ssize(infos); ++j) {
214 if (not infos[j].is_skip()) {
215 if (infos[j] == Lower) {
216 return true;
217 } else if (infos[j] == OLetter or infos[j] == Upper or is_ParaSep(infos[j]) or is_SATerm(infos[j])) {
218 return false;
219 }
220 }
221 }
222 return false;
223 }();
224
// The rules are tested in order and the first one that matches decides the boundary.
225 r[i] = [&] () {
226 if (prev == ATerm and next == Numeric) {
227 return no; // SB6
228 } else if ((prev_prev == Upper or prev_prev == Lower) and prev == ATerm and next == Upper) {
229 return no; // SB7
230 } else if (prefix == ATerm and optional_close_sp and end_in_lower) {
231 return no; // SB8
232 } else if (is_SATerm(prefix) and optional_close_sp and (next == SContinue or is_SATerm(next))) {
233 return no; // SB8a
234 } else if (is_SATerm(prefix) and optional_close and (next == Close or next == Sp or is_ParaSep(next))) {
235 return no; // SB9
236 } else if (is_SATerm(prefix) and optional_close_sp and (next == Sp or is_ParaSep(next))) {
237 return no; // SB10
238 } else if (is_SATerm(prefix) and optional_close_sp_par) {
239 return yes; // SB11
240 } else {
241 return no; // SB998
242 }
243 }();
244 }
245}
246
247}
248
// Compute the sentence-break opportunities for the items in [first, last).
//
// @param first Iterator to the first item.
// @param last Iterator or sentinel one past the last item.
// @param code_point_func Callable invoked on each item; its result is passed to
//        ucd_get_sentence_break_property() to look up the sentence-break property.
// @return A vector with one entry more than the number of items: one entry per
//         boundary, including the boundaries before the first and after the last item.
//
// The range is traversed twice (std::distance, then std::transform).
// NOTE(review): the function is declared noexcept although it fills containers; if
// those containers can throw on allocation the program would terminate - confirm this
// is intended.
// NOTE(review): the declaration of `infos` (original line 263) is missing from this
// listing.
256template<typename It, typename ItEnd, typename CodePointFunc>
257[[nodiscard]] inline unicode_break_vector
258unicode_sentence_break(It first, ItEnd last, CodePointFunc const& code_point_func) noexcept
259{
260 auto size = narrow_cast<size_t>(std::distance(first, last));
// One entry per boundary, all initially undecided.
261 auto r = unicode_break_vector{size + 1, unicode_break_opportunity::unassigned};
262
264 infos.reserve(size);
// Translate every item into its sentence-break info (skip flag clear).
265 std::transform(first, last, std::back_inserter(infos), [&] (auto const &item) {
266 auto const code_point = code_point_func(item);
267 return detail::unicode_sentence_break_info{ucd_get_sentence_break_property(code_point)};
268 });
269
// The passes depend on each other's results and must run in this order.
270 detail::unicode_sentence_break_SB1_SB4(r, infos);
271 detail::unicode_sentence_break_SB5(r, infos);
272 detail::unicode_sentence_break_SB6_SB998(r, infos);
273 return r;
274}
275
276
277}
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
unicode_break_vector unicode_sentence_break(It first, ItEnd last, CodePointFunc const &code_point_func) noexcept
The unicode sentence break algorithm UAX#29.
Definition unicode_sentence_break.hpp:258
Definition unicode_sentence_break.hpp:25
T back_inserter(T... args)
T distance(T... args)
T make_pair(T... args)
T next(T... args)
T prev(T... args)
T reserve(T... args)
T transform(T... args)