HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
unicode_word_break.hpp
Go to the documentation of this file.
// Copyright Take Vos 2022.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
8#pragma once
9
10#include "unicode_break_opportunity.hpp"
11#include "ucd_general_categories.hpp"
12#include "ucd_grapheme_cluster_breaks.hpp"
13#include "ucd_word_break_properties.hpp"
14#include "../utility/module.hpp"
15#include <algorithm>
16#include <vector>
17
18namespace hi::inline v1 {
19
20namespace detail {
21
23public:
24 constexpr unicode_word_break_info() noexcept : _value(0) {}
25 constexpr unicode_word_break_info(unicode_word_break_info const&) noexcept = default;
26 constexpr unicode_word_break_info(unicode_word_break_info&&) noexcept = default;
27 constexpr unicode_word_break_info& operator=(unicode_word_break_info const&) noexcept = default;
28 constexpr unicode_word_break_info& operator=(unicode_word_break_info&&) noexcept = default;
29
30 constexpr unicode_word_break_info(unicode_word_break_property const& word_break_property, bool pictographic) noexcept :
31 _value(to_underlying(word_break_property) | (wide_cast<uint8_t>(pictographic) << 7))
32 {
33 }
34
35 constexpr unicode_word_break_info& make_skip() noexcept
36 {
37 _value |= 0x40;
38 return *this;
39 }
40
41 [[nodiscard]] constexpr bool is_skip() const noexcept
42 {
43 return to_bool(_value & 0x40);
44 }
45
46 [[nodiscard]] constexpr bool is_pictographic() const noexcept
47 {
48 return to_bool(_value & 0x80);
49 }
50
51 [[nodiscard]] constexpr friend bool
52 operator==(unicode_word_break_info const& lhs, unicode_word_break_property const& rhs) noexcept
53 {
54 return (lhs._value & 0x3f) == to_underlying(rhs);
55 }
56
57 [[nodiscard]] constexpr friend bool
58 operator==(unicode_word_break_info const&, unicode_word_break_info const&) noexcept = default;
59
60 [[nodiscard]] constexpr friend bool is_AHLetter(unicode_word_break_info const& rhs) noexcept
61 {
62 return rhs == unicode_word_break_property::ALetter or rhs == unicode_word_break_property::Hebrew_Letter;
63 }
64
65 [[nodiscard]] constexpr friend bool is_MidNumLetQ(unicode_word_break_info const& rhs) noexcept
66 {
67 return rhs == unicode_word_break_property::MidNumLet or rhs == unicode_word_break_property::Single_Quote;
68 }
69
70private:
71 uint8_t _value;
72};
73
74[[nodiscard]] inline void
75unicode_word_break_WB1_WB3d(unicode_break_vector& r, std::vector<unicode_word_break_info>& infos) noexcept
76{
77 using enum unicode_break_opportunity;
78 using enum unicode_word_break_property;
79
80 hi_axiom(r.size() == infos.size() + 1);
81
82 r.front() = yes; // WB1
83 r.back() = yes; // WB2
84
85 for (auto i = 1_uz; i < infos.size(); ++i) {
86 hilet prev = infos[i - 1];
87 hilet next = infos[i];
88
89 r[i] = [&]() {
90 if (prev == CR and next == LF) {
91 return no; // WB3
92 } else if (prev == Newline or prev == CR or prev == LF) {
93 return yes; // WB3a
94 } else if (next == Newline or next == CR or next == LF) {
95 return yes; // WB3b
96 } else if (prev == ZWJ and next.is_pictographic()) {
97 return no; // WB3c
98 } else if (prev == WSegSpace and next == WSegSpace) {
99 return no; // WB3d
100 } else {
101 return unassigned;
102 }
103 }();
104 }
105}
106
107[[nodiscard]] inline void unicode_word_break_WB4(unicode_break_vector& r, std::vector<unicode_word_break_info>& infos) noexcept
108{
109 using enum unicode_break_opportunity;
110 using enum unicode_word_break_property;
111
112 hi_axiom(r.size() == infos.size() + 1);
113
114 for (auto i = 1_uz; i < infos.size(); ++i) {
115 hilet prev = infos[i - 1];
116 auto& next = infos[i];
117
118 if ((prev != Newline and prev != CR and prev != LF) and (next == Extend or next == Format or next == ZWJ)) {
119 if (r[i] == unassigned) {
120 r[i] = no;
121 }
122 next.make_skip();
123 }
124 }
125}
126
127[[nodiscard]] inline void
128unicode_word_break_WB5_WB999(unicode_break_vector& r, std::vector<unicode_word_break_info>& infos) noexcept
129{
130 using enum unicode_break_opportunity;
131 using enum unicode_word_break_property;
132
133 hi_axiom(r.size() == infos.size() + 1);
134
135 for (auto i = 0_uz; i != infos.size(); ++i) {
136 if (r[i] != unassigned) {
137 continue;
138 }
139
140 hilet& next = infos[i];
141
142 // WB4: (Extend | Format | ZWJ)* is assigned to no-break.
143 hi_axiom(not next.is_skip());
144
145 auto prev_i = narrow_cast<ptrdiff_t>(i) - 1;
146 auto prev = unicode_word_break_info{};
147 for (; prev_i >= 0 ; --prev_i) {
148 if (not infos[prev_i].is_skip()) {
149 prev = infos[prev_i];
150 break;
151 }
152 }
153
154 auto prev_prev_i = prev_i - 1;
155 auto prev_prev = unicode_word_break_info{};
156 for (; prev_prev_i >= 0; --prev_prev_i) {
157 if (not infos[prev_prev_i].is_skip()) {
158 prev_prev = infos[prev_prev_i];
159 break;
160 }
161 }
162
163 auto next_next_i = i + 1;
164 auto next_next = unicode_word_break_info{};
165 for (; next_next_i != infos.size(); ++next_next_i) {
166 if (not infos[next_next_i].is_skip()) {
167 next_next = infos[next_next_i];
168 break;
169 }
170 }
171
172 auto RI_i = prev_i - 1;
173 auto RI_is_pair = true;
174 if (prev == Regional_Indicator and next == Regional_Indicator) {
175 // Track back before prev, and count consecutive RI.
176 for (; RI_i >= 0; --RI_i) {
177 if (infos[RI_i].is_skip()) {
178 continue;
179 } else if (infos[RI_i] != Regional_Indicator) {
180 break;
181 }
182 RI_is_pair = not RI_is_pair;
183 }
184 }
185
186 r[i] = [&] {
187 if (is_AHLetter(prev) and is_AHLetter(next)) {
188 return no; // WB5
189 } else if (is_AHLetter(prev) and (next == MidLetter or is_MidNumLetQ(next)) and is_AHLetter(next_next)) {
190 return no; // WB6
191 } else if (is_AHLetter(prev_prev) and (prev == MidLetter or is_MidNumLetQ(prev)) and is_AHLetter(next)) {
192 return no; // WB7
193 } else if (prev == Hebrew_Letter and next == Single_Quote) {
194 return no; // WB7a
195 } else if (prev == Hebrew_Letter and next == Double_Quote and next_next == Hebrew_Letter) {
196 return no; // WB7b
197 } else if (prev_prev == Hebrew_Letter and prev == Double_Quote and next == Hebrew_Letter) {
198 return no; // WB7c
199 } else if (prev == Numeric and next == Numeric) {
200 return no; // WB8
201 } else if (is_AHLetter(prev) and next == Numeric) {
202 return no; // WB9
203 } else if (prev == Numeric and is_AHLetter(next)) {
204 return no; // WB10
205 } else if (prev_prev == Numeric and (prev == MidNum or is_MidNumLetQ(prev)) and next == Numeric) {
206 return no; // WB11
207 } else if (prev == Numeric and (next == MidNum or is_MidNumLetQ(next)) and next_next == Numeric) {
208 return no; // WB12
209 } else if (prev == Katakana and next == Katakana) {
210 return no; // WB13
211 } else if (
212 (is_AHLetter(prev) or prev == Numeric or prev == Katakana or prev == ExtendNumLet) and next == ExtendNumLet) {
213 return no; // WB13a
214 } else if (prev == ExtendNumLet and (is_AHLetter(next) or next == Numeric or next == Katakana)) {
215 return no; // WB13b
216 } else if (prev == Regional_Indicator and next == Regional_Indicator and RI_is_pair) {
217 return no; // WB15 WB16
218 } else {
219 return yes; // WB999
220 }
221 }();
222 }
223}
224
225} // namespace detail
226
234template<typename It, typename ItEnd, typename CodePointFunc>
235[[nodiscard]] inline unicode_break_vector unicode_word_break(It first, ItEnd last, CodePointFunc const& code_point_func) noexcept
236{
237 auto size = narrow_cast<size_t>(std::distance(first, last));
238 auto r = unicode_break_vector{size + 1, unicode_break_opportunity::unassigned};
239
241 infos.reserve(size);
242 std::transform(first, last, std::back_inserter(infos), [&](hilet& item) {
243 hilet code_point = code_point_func(item);
244 hilet word_break_property = ucd_get_word_break_property(code_point);
245 hilet grapheme_cluster_break = ucd_get_grapheme_cluster_break(code_point);
247 word_break_property, grapheme_cluster_break == unicode_grapheme_cluster_break::Extended_Pictographic};
248 });
249
250 detail::unicode_word_break_WB1_WB3d(r, infos);
251 detail::unicode_word_break_WB4(r, infos);
252 detail::unicode_word_break_WB5_WB999(r, infos);
253 return r;
254}
255
271void wrap_lines(auto first, auto last, float max_width, auto get_width, auto get_code_point, auto set_code_point) noexcept
272{
273 using enum unicode_general_category;
274
275 auto it_at_last_space = last;
276 float width_at_last_space = 0.0;
277 float current_width = 0.0;
278
279 for (auto it = first; it != last; ++it) {
280 hilet code_point = get_code_point(*it);
281 hilet general_category = ucd_get_general_category(code_point);
282
283 if (general_category == Zp || general_category == Zl) {
284 // Reset the line on existing line and paragraph separator.
285 it_at_last_space = last;
286 width_at_last_space = 0.0f;
287 current_width = 0.0;
288 continue;
289
290 } else if (general_category == Zs) {
291 // Remember the length of the line at the end of the word.
292 it_at_last_space = it;
293 width_at_last_space = current_width;
294 }
295
296 current_width += get_width(*it);
297 if (current_width >= max_width && it_at_last_space != last) {
298 // The line is too long, replace the last space with a line separator.
299 set_code_point(*it, U'\u2028');
300 it_at_last_space = last;
301 width_at_last_space = 0.0f;
302 current_width = 0.0;
303 continue;
304 }
305 }
306}
307
308} // namespace hi::inline v1
#define hi_axiom(expression,...)
Specify an axiom; an expression that is true.
Definition assert.hpp:253
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
DOXYGEN BUG.
Definition algorithm.hpp:13
void wrap_lines(auto first, auto last, float max_width, auto get_width, auto get_code_point, auto set_code_point) noexcept
Wrap lines in text that are too wide.
Definition unicode_word_break.hpp:271
unicode_break_vector unicode_word_break(It first, ItEnd last, CodePointFunc const &code_point_func) noexcept
The unicode word break algorithm UAX#29.
Definition unicode_word_break.hpp:235
Definition unicode_word_break.hpp:22
T back_inserter(T... args)
T distance(T... args)
T next(T... args)
T prev(T... args)
T reserve(T... args)
T transform(T... args)