HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
unicode_sentence_break.hpp
Go to the documentation of this file.
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
8#pragma once
9
10#include "ucd_sentence_break_properties.hpp"
11#include "../macros.hpp"
12#include <tuple>
13
14namespace hi::inline v1 {
15
16namespace detail {
17
19public:
20 constexpr unicode_sentence_break_info() noexcept : _value(0)
21 {}
// Copy and move construction/assignment are all compiler-generated (defaulted),
// declared constexpr and noexcept.
22 constexpr unicode_sentence_break_info(unicode_sentence_break_info const &) noexcept = default;
23 constexpr unicode_sentence_break_info(unicode_sentence_break_info &&) noexcept = default;
24 constexpr unicode_sentence_break_info &operator=(unicode_sentence_break_info const &) noexcept = default;
25 constexpr unicode_sentence_break_info &operator=(unicode_sentence_break_info &&) noexcept = default;
26
27 constexpr unicode_sentence_break_info(unicode_sentence_break_property const &sentence_break_property) noexcept : _value(std::to_underlying(sentence_break_property))
28 {}
29
30 constexpr unicode_sentence_break_info &make_skip() noexcept
31 {
32 _value |= 0x40;
33 return *this;
34 }
35
36 [[nodiscard]] constexpr bool is_skip() const noexcept
37 {
38 return to_bool(_value & 0x40);
39 }
40
41 [[nodiscard]] constexpr friend bool operator==(unicode_sentence_break_info const &lhs, unicode_sentence_break_property const &rhs) noexcept
42 {
43 return (lhs._value & 0x3f) == std::to_underlying(rhs);
44 }
45
// Defaulted member-wise equality between two infos. Unlike the comparison against a
// property, no mask is applied, so the skip flag takes part in the comparison.
46 [[nodiscard]] constexpr friend bool operator==(unicode_sentence_break_info const &, unicode_sentence_break_info const &) noexcept = default;
47
48 [[nodiscard]] constexpr friend bool is_ParaSep(unicode_sentence_break_info const &rhs) noexcept
49 {
50 return rhs == unicode_sentence_break_property::Sep or rhs == unicode_sentence_break_property::CR or rhs == unicode_sentence_break_property::LF;
51 }
52
53 [[nodiscard]] constexpr friend bool is_SATerm(unicode_sentence_break_info const &rhs) noexcept
54 {
55 return rhs == unicode_sentence_break_property::STerm or rhs == unicode_sentence_break_property::ATerm;
56 }
57
58private:
// Packed state: the bits under mask 0x3f hold the sentence-break property's underlying
// value (see the comparison with a property); bit 0x40 is the skip flag set by make_skip().
59 uint8_t _value;
60};
61
// Applies sentence-break rules SB1 through SB4.
//
// The first and last entries of `r` are set to `yes`. For every adjacent pair
// infos[i - 1], infos[i] the entry r[i] is set to `no` for a CR followed by LF, to `yes`
// when the first of the pair is a paragraph separator, and to `unassigned` otherwise so
// that later passes can decide it.
//
// Precondition (hi_axiom): r.size() == infos.size() + 1.
//
// NOTE(review): the parameter list (listing lines 63-64) is missing from this extract; the
// body uses `r` and `infos` -- confirm their types against the real header.
// NOTE(review): [[nodiscard]] has no effect on a function returning void.
62[[nodiscard]] inline void unicode_sentence_break_SB1_SB4(
65{
66 using enum unicode_break_opportunity;
67 using enum unicode_sentence_break_property;
68
69 hi_axiom(r.size() == infos.size() + 1);
70
71 r.front() = yes; // SB1: always break at the start of the text.
72 r.back() = yes; // SB2: always break at the end of the text.
73
74 for (auto i = 1_uz; i < infos.size(); ++i) {
75 hilet prev = infos[i - 1];
76 hilet next = infos[i];
77
78 r[i] = [&] () {
79 if (prev == CR and next == LF) {
80 return no; // SB3: do not break inside a CR LF pair.
81 } else if (is_ParaSep(prev)) {
82 return yes; // SB4: break after a paragraph separator (Sep, CR or LF).
83 } else {
 // Not decided by SB1-SB4; left for the later rule passes.
84 return unassigned;
85 }
86 }();
87 }
88}
89
// Applies sentence-break rule SB5.
//
// For every adjacent pair infos[i - 1], infos[i]: when the first is not a paragraph
// separator and the second is Extend or Format, r[i] becomes `no` (only if it was still
// `unassigned`) and infos[i] is flagged as skip via make_skip().
//
// Precondition (hi_axiom): r.size() == infos.size() + 1.
//
// NOTE(review): the second parameter line (listing line 92) is missing from this extract;
// the body mutates `infos`, so it is presumably a non-const reference -- confirm.
// NOTE(review): [[nodiscard]] has no effect on a function returning void.
90[[nodiscard]] inline void unicode_sentence_break_SB5(
91 unicode_break_vector &r,
93{
94 using enum unicode_break_opportunity;
95 using enum unicode_sentence_break_property;
96
97 hi_axiom(r.size() == infos.size() + 1);
98
99 for (auto i = 1_uz; i < infos.size(); ++i) {
100 hilet prev = infos[i - 1];
101 auto &next = infos[i];
102
 // is_ParaSep() already covers CR and LF, so the two extra comparisons are redundant.
 // Comparisons against a property ignore the skip flag, so a `prev` that was flagged
 // in the previous iteration is still compared by property only.
103 if ((not is_ParaSep(prev) and prev != CR and prev != LF) and (next == Extend or next == Format)) {
104 if (r[i] == unassigned) {
105 r[i] = no;
106 }
 // Flag the entry so the SB6-SB998 pass steps over it.
107 next.make_skip();
108 }
109 }
110}
111
// Applies the remaining sentence-break rules (SB6 through SB998) to every index i of
// `infos` whose r[i] is still `unassigned`.
//
// For each such index the code gathers:
//  - `next`: infos[i];
//  - `prev` / `prev_prev`: the nearest and second-nearest non-skipped entries before i
//    (a default-constructed info when there is none);
//  - `prefix`: the first non-skipped entry reached when walking backwards from i - 1 past
//    an optional paragraph separator, then Sp entries, then Close entries, together with
//    a bitmask of what was walked over;
//  - `end_in_lower`: whether a Lower entry is reached when scanning forward from i before
//    any OLetter, Upper, paragraph separator, STerm or ATerm.
// It then stores `yes` or `no` in r[i], so no visited entry stays `unassigned`.
//
// Precondition (hi_axiom): r.size() == infos.size() + 1.
//
// NOTE(review): listing lines 114 (rest of the parameter list) and 129 (presumably the
// declaration of `k`) are missing from this extract -- confirm against the real header.
// NOTE(review): [[nodiscard]] has no effect on a function returning void.
112[[nodiscard]] inline void unicode_sentence_break_SB6_SB998(
113 unicode_break_vector &r,
115{
116 using enum unicode_break_opportunity;
117 using enum unicode_sentence_break_property;
118
119 hi_axiom(r.size() == infos.size() + 1);
120
121 for (auto i = 0_z; i < std::ssize(infos); ++i) {
122 hilet &next = infos[i];
 // Boundaries already decided by the earlier passes are left alone.
123 if (r[i] != unassigned) {
124 continue;
125 }
126
 // The SB5 pass assigns r[i] whenever it flags infos[i] as skip, so a skipped entry
 // should never reach this point.
127 hi_axiom(not next.is_skip());
128
130
 // Nearest non-skipped entry before i. `k` is shared with the next lambda and is left
 // at the index where the search stopped.
131 hilet prev = [&] {
132 for (k = i - 1; k >= 0; --k) {
133 if (not infos[k].is_skip()) {
134 return infos[k];
135 }
136 }
137 return unicode_sentence_break_info{};
138 }();
139
 // Second-nearest non-skipped entry: continues the backward search just below `k`.
140 hilet prev_prev = [&] {
141 for (--k; k >= 0; --k) {
142 if (not infos[k].is_skip()) {
143 return infos[k];
144 }
145 }
146 return unicode_sentence_break_info{};
147 }();
148
149 // close_sp_par_found: bitmask of what lies between `prefix` and this boundary
150 // 0 - no suffix
151 // 1 - ends in ParaSep
152 // 2 - includes Sp
153 // 4 - includes Close
154 hilet [prefix, close_sp_par_found] = [&]() {
 // Already in effect from the enclosing function scope.
155 using enum unicode_break_opportunity;
156
157 auto found = 0;
 // Backward scanner state: ' ' nothing consumed yet, 'p' consumed a paragraph
 // separator, 's' consuming Sp entries, 'c' consuming Close entries.
158 auto state = ' ';
159 for (auto j = i - 1; j >= 0; --j) {
160 if (not infos[j].is_skip()) {
161 switch (state) {
162 case ' ':
163 if (is_ParaSep(infos[j])) {
164 found |= 1;
165 state = 'p';
166 } else if (infos[j] == Sp) {
167 found |= 2;
168 state = 's';
169 } else if (infos[j] == Close) {
170 found |= 4;
171 state = 'c';
172 } else {
 // Nothing to walk over: this entry is the prefix.
173 return std::make_pair(infos[j], found);
174 }
175 break;
176 case 'p': // Entered at most once; afterwards it behaves like 's'.
177 case 's':
178 if (infos[j] == Sp) {
179 found |= 2;
180 state = 's';
181 } else if (infos[j] == Close) {
182 found |= 4;
183 state = 'c';
184 } else {
185 return std::make_pair(infos[j], found);
186 }
187 break;
188 case 'c':
189 if (infos[j] == Close) {
190 found |= 4;
191 state = 'c';
192 } else {
193 return std::make_pair(infos[j], found);
194 }
195 break;
196 }
197 }
198 }
 // Ran off the start of the text without finding a prefix entry.
199 return std::make_pair(unicode_sentence_break_info{}, 0);
200 }();
 // True when neither a paragraph separator nor Sp was walked over (only Close, or nothing).
201 hilet optional_close = (close_sp_par_found & 3) == 0;
 // True when no paragraph separator was walked over (only Close and/or Sp, or nothing).
202 hilet optional_close_sp = (close_sp_par_found & 1) == 0;
 // Any combination accepted by the backward scan is allowed, hence always true.
203 hilet optional_close_sp_par = true;
204
 // Forward scan from i over non-skipped entries: true when Lower is reached first,
 // false when OLetter, Upper, a paragraph separator or STerm/ATerm comes first or the
 // text ends.
205 hilet end_in_lower = [&]{
206 for (auto j = i; j < std::ssize(infos); ++j) {
207 if (not infos[j].is_skip()) {
208 if (infos[j] == Lower) {
209 return true;
210 } else if (infos[j] == OLetter or infos[j] == Upper or is_ParaSep(infos[j]) or is_SATerm(infos[j])) {
211 return false;
212 }
213 }
214 }
215 return false;
216 }();
217
 // The rules are tried in order; the first one that matches decides the boundary.
218 r[i] = [&] () {
219 if (prev == ATerm and next == Numeric) {
220 return no; // SB6: ATerm followed by Numeric.
221 } else if ((prev_prev == Upper or prev_prev == Lower) and prev == ATerm and next == Upper) {
222 return no; // SB7: (Upper | Lower) ATerm followed by Upper.
223 } else if (prefix == ATerm and optional_close_sp and end_in_lower) {
224 return no; // SB8: ATerm Close* Sp* followed by text that reaches a Lower.
225 } else if (is_SATerm(prefix) and optional_close_sp and (next == SContinue or is_SATerm(next))) {
226 return no; // SB8a: SATerm Close* Sp* followed by SContinue or SATerm.
227 } else if (is_SATerm(prefix) and optional_close and (next == Close or next == Sp or is_ParaSep(next))) {
228 return no; // SB9: SATerm Close* followed by Close, Sp or ParaSep.
229 } else if (is_SATerm(prefix) and optional_close_sp and (next == Sp or is_ParaSep(next))) {
230 return no; // SB10: SATerm Close* Sp* followed by Sp or ParaSep.
231 } else if (is_SATerm(prefix) and optional_close_sp_par) {
232 return yes; // SB11: break after SATerm Close* Sp* ParaSep?.
233 } else {
234 return no; // SB998: otherwise do not break.
235 }
236 }();
237 }
238}
239
240}
241
/** Compute sentence-break opportunities for a range of items (sentence-break rules SB1 to SB998).
 *
 * Each item is mapped to a sentence-break info through `code_point_func` and
 * `ucd_get_sentence_break_property()`, after which the three rule passes in `detail` run
 * in order over the same break vector.
 *
 * @param first Iterator to the first item.
 * @param last Iterator or sentinel marking the end of the items.
 * @param code_point_func Callable invoked on each item; its result is passed to
 *                        `ucd_get_sentence_break_property()`.
 * @return The break vector after all passes. It is constructed from `size + 1` and
 *         `unassigned` -- presumably one entry per boundary, TODO confirm against
 *         unicode_break_vector.
 *
 * NOTE(review): listing line 256 (the declaration of `infos`) is missing from this extract.
 */
249template<typename It, typename ItEnd, typename CodePointFunc>
250[[nodiscard]] inline unicode_break_vector
251unicode_sentence_break(It first, ItEnd last, CodePointFunc const& code_point_func) noexcept
252{
253 auto size = narrow_cast<size_t>(std::distance(first, last));
254 auto r = unicode_break_vector{size + 1, unicode_break_opportunity::unassigned};
255
 // Reserve up front so that back_inserter does not trigger reallocations.
257 infos.reserve(size);
258 std::transform(first, last, std::back_inserter(infos), [&] (hilet &item) {
259 hilet code_point = code_point_func(item);
260 return detail::unicode_sentence_break_info{ucd_get_sentence_break_property(code_point)};
261 });
262
 // Order matters: each pass only fills in what the previous ones left `unassigned`,
 // and the last pass relies on the skip flags set by the SB5 pass.
263 detail::unicode_sentence_break_SB1_SB4(r, infos);
264 detail::unicode_sentence_break_SB5(r, infos);
265 detail::unicode_sentence_break_SB6_SB998(r, infos);
266 return r;
267}
268
269
270}
DOXYGEN BUG.
Definition algorithm.hpp:16
unicode_break_vector unicode_sentence_break(It first, ItEnd last, CodePointFunc const &code_point_func) noexcept
The unicode sentence break algorithm UAX#29.
Definition unicode_sentence_break.hpp:251
constexpr Out narrow_cast(In const &rhs) noexcept
Cast numeric values without loss of precision.
Definition cast.hpp:377
Definition unicode_sentence_break.hpp:18
T back_inserter(T... args)
T distance(T... args)
T make_pair(T... args)
T next(T... args)
T prev(T... args)
T reserve(T... args)
T transform(T... args)