HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
unicode_word_break.hpp
Go to the documentation of this file.
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
8#pragma once
9
10#include "unicode_break_opportunity.hpp"
11#include "ucd_general_categories.hpp"
12#include "ucd_grapheme_cluster_breaks.hpp"
13#include "ucd_word_break_properties.hpp"
14#include "../utility/utility.hpp"
15#include "../macros.hpp"
16#include <algorithm>
17#include <vector>
18
19
20
21namespace hi::inline v1 {
22
23namespace detail {
24
26public:
27 constexpr unicode_word_break_info() noexcept : _value(0) {}
28 constexpr unicode_word_break_info(unicode_word_break_info const&) noexcept = default;
29 constexpr unicode_word_break_info(unicode_word_break_info&&) noexcept = default;
30 constexpr unicode_word_break_info& operator=(unicode_word_break_info const&) noexcept = default;
31 constexpr unicode_word_break_info& operator=(unicode_word_break_info&&) noexcept = default;
32
33 constexpr unicode_word_break_info(unicode_word_break_property const& word_break_property, bool pictographic) noexcept :
34 _value(std::to_underlying(word_break_property) | (wide_cast<uint8_t>(pictographic) << 7))
35 {
36 }
37
38 constexpr unicode_word_break_info& make_skip() noexcept
39 {
40 _value |= 0x40;
41 return *this;
42 }
43
44 [[nodiscard]] constexpr bool is_skip() const noexcept
45 {
46 return to_bool(_value & 0x40);
47 }
48
49 [[nodiscard]] constexpr bool is_pictographic() const noexcept
50 {
51 return to_bool(_value & 0x80);
52 }
53
54 [[nodiscard]] constexpr friend bool
55 operator==(unicode_word_break_info const& lhs, unicode_word_break_property const& rhs) noexcept
56 {
57 return (lhs._value & 0x3f) == std::to_underlying(rhs);
58 }
59
60 [[nodiscard]] constexpr friend bool
61 operator==(unicode_word_break_info const&, unicode_word_break_info const&) noexcept = default;
62
63 [[nodiscard]] constexpr friend bool is_AHLetter(unicode_word_break_info const& rhs) noexcept
64 {
65 return rhs == unicode_word_break_property::ALetter or rhs == unicode_word_break_property::Hebrew_Letter;
66 }
67
68 [[nodiscard]] constexpr friend bool is_MidNumLetQ(unicode_word_break_info const& rhs) noexcept
69 {
70 return rhs == unicode_word_break_property::MidNumLet or rhs == unicode_word_break_property::Single_Quote;
71 }
72
73private:
74 uint8_t _value;
75};
76
77[[nodiscard]] inline void
78unicode_word_break_WB1_WB3d(unicode_break_vector& r, std::vector<unicode_word_break_info>& infos) noexcept
79{
80 using enum unicode_break_opportunity;
81 using enum unicode_word_break_property;
82
83 hi_axiom(r.size() == infos.size() + 1);
84
85 r.front() = yes; // WB1
86 r.back() = yes; // WB2
87
88 for (auto i = 1_uz; i < infos.size(); ++i) {
89 hilet prev = infos[i - 1];
90 hilet next = infos[i];
91
92 r[i] = [&]() {
93 if (prev == CR and next == LF) {
94 return no; // WB3
95 } else if (prev == Newline or prev == CR or prev == LF) {
96 return yes; // WB3a
97 } else if (next == Newline or next == CR or next == LF) {
98 return yes; // WB3b
99 } else if (prev == ZWJ and next.is_pictographic()) {
100 return no; // WB3c
101 } else if (prev == WSegSpace and next == WSegSpace) {
102 return no; // WB3d
103 } else {
104 return unassigned;
105 }
106 }();
107 }
108}
109
110[[nodiscard]] inline void unicode_word_break_WB4(unicode_break_vector& r, std::vector<unicode_word_break_info>& infos) noexcept
111{
112 using enum unicode_break_opportunity;
113 using enum unicode_word_break_property;
114
115 hi_axiom(r.size() == infos.size() + 1);
116
117 for (auto i = 1_uz; i < infos.size(); ++i) {
118 hilet prev = infos[i - 1];
119 auto& next = infos[i];
120
121 if ((prev != Newline and prev != CR and prev != LF) and (next == Extend or next == Format or next == ZWJ)) {
122 if (r[i] == unassigned) {
123 r[i] = no;
124 }
125 next.make_skip();
126 }
127 }
128}
129
130[[nodiscard]] inline void
131unicode_word_break_WB5_WB999(unicode_break_vector& r, std::vector<unicode_word_break_info>& infos) noexcept
132{
133 using enum unicode_break_opportunity;
134 using enum unicode_word_break_property;
135
136 hi_axiom(r.size() == infos.size() + 1);
137
138 for (auto i = 0_uz; i != infos.size(); ++i) {
139 if (r[i] != unassigned) {
140 continue;
141 }
142
143 hilet& next = infos[i];
144
145 // WB4: (Extend | Format | ZWJ)* is assigned to no-break.
146 hi_axiom(not next.is_skip());
147
148 auto prev_i = narrow_cast<ptrdiff_t>(i) - 1;
149 auto prev = unicode_word_break_info{};
150 for (; prev_i >= 0 ; --prev_i) {
151 if (not infos[prev_i].is_skip()) {
152 prev = infos[prev_i];
153 break;
154 }
155 }
156
157 auto prev_prev_i = prev_i - 1;
158 auto prev_prev = unicode_word_break_info{};
159 for (; prev_prev_i >= 0; --prev_prev_i) {
160 if (not infos[prev_prev_i].is_skip()) {
161 prev_prev = infos[prev_prev_i];
162 break;
163 }
164 }
165
166 auto next_next_i = i + 1;
167 auto next_next = unicode_word_break_info{};
168 for (; next_next_i != infos.size(); ++next_next_i) {
169 if (not infos[next_next_i].is_skip()) {
170 next_next = infos[next_next_i];
171 break;
172 }
173 }
174
175 auto RI_i = prev_i - 1;
176 auto RI_is_pair = true;
177 if (prev == Regional_Indicator and next == Regional_Indicator) {
178 // Track back before prev, and count consecutive RI.
179 for (; RI_i >= 0; --RI_i) {
180 if (infos[RI_i].is_skip()) {
181 continue;
182 } else if (infos[RI_i] != Regional_Indicator) {
183 break;
184 }
185 RI_is_pair = not RI_is_pair;
186 }
187 }
188
189 r[i] = [&] {
190 if (is_AHLetter(prev) and is_AHLetter(next)) {
191 return no; // WB5
192 } else if (is_AHLetter(prev) and (next == MidLetter or is_MidNumLetQ(next)) and is_AHLetter(next_next)) {
193 return no; // WB6
194 } else if (is_AHLetter(prev_prev) and (prev == MidLetter or is_MidNumLetQ(prev)) and is_AHLetter(next)) {
195 return no; // WB7
196 } else if (prev == Hebrew_Letter and next == Single_Quote) {
197 return no; // WB7a
198 } else if (prev == Hebrew_Letter and next == Double_Quote and next_next == Hebrew_Letter) {
199 return no; // WB7b
200 } else if (prev_prev == Hebrew_Letter and prev == Double_Quote and next == Hebrew_Letter) {
201 return no; // WB7c
202 } else if (prev == Numeric and next == Numeric) {
203 return no; // WB8
204 } else if (is_AHLetter(prev) and next == Numeric) {
205 return no; // WB9
206 } else if (prev == Numeric and is_AHLetter(next)) {
207 return no; // WB10
208 } else if (prev_prev == Numeric and (prev == MidNum or is_MidNumLetQ(prev)) and next == Numeric) {
209 return no; // WB11
210 } else if (prev == Numeric and (next == MidNum or is_MidNumLetQ(next)) and next_next == Numeric) {
211 return no; // WB12
212 } else if (prev == Katakana and next == Katakana) {
213 return no; // WB13
214 } else if (
215 (is_AHLetter(prev) or prev == Numeric or prev == Katakana or prev == ExtendNumLet) and next == ExtendNumLet) {
216 return no; // WB13a
217 } else if (prev == ExtendNumLet and (is_AHLetter(next) or next == Numeric or next == Katakana)) {
218 return no; // WB13b
219 } else if (prev == Regional_Indicator and next == Regional_Indicator and RI_is_pair) {
220 return no; // WB15 WB16
221 } else {
222 return yes; // WB999
223 }
224 }();
225 }
226}
227
228} // namespace detail
229
237template<typename It, typename ItEnd, typename CodePointFunc>
239{
240 auto size = narrow_cast<size_t>(std::distance(first, last));
241 auto r = unicode_break_vector{size + 1, unicode_break_opportunity::unassigned};
242
244 infos.reserve(size);
245 std::transform(first, last, std::back_inserter(infos), [&](hilet& item) {
246 hilet code_point = code_point_func(item);
247 hilet word_break_property = ucd_get_word_break_property(code_point);
248 hilet grapheme_cluster_break = ucd_get_grapheme_cluster_break(code_point);
250 word_break_property, grapheme_cluster_break == unicode_grapheme_cluster_break::Extended_Pictographic};
251 });
252
253 detail::unicode_word_break_WB1_WB3d(r, infos);
254 detail::unicode_word_break_WB4(r, infos);
255 detail::unicode_word_break_WB5_WB999(r, infos);
256 return r;
257}
258
274void wrap_lines(auto first, auto last, float max_width, auto get_width, auto get_code_point, auto set_code_point) noexcept
275{
276 using enum unicode_general_category;
277
278 auto it_at_last_space = last;
279 float width_at_last_space = 0.0;
280 float current_width = 0.0;
281
282 for (auto it = first; it != last; ++it) {
283 hilet code_point = get_code_point(*it);
284 hilet general_category = ucd_get_general_category(code_point);
285
286 if (general_category == Zp || general_category == Zl) {
287 // Reset the line on existing line and paragraph separator.
288 it_at_last_space = last;
289 width_at_last_space = 0.0f;
290 current_width = 0.0;
291 continue;
292
293 } else if (general_category == Zs) {
294 // Remember the length of the line at the end of the word.
297 }
298
300 if (current_width >= max_width && it_at_last_space != last) {
301 // The line is too long, replace the last space with a line separator.
302 set_code_point(*it, U'\u2028');
303 it_at_last_space = last;
304 width_at_last_space = 0.0f;
305 current_width = 0.0;
306 continue;
307 }
308 }
309}
310
311} // namespace hi::inline v1
DOXYGEN BUG.
Definition algorithm.hpp:16
void wrap_lines(auto first, auto last, float max_width, auto get_width, auto get_code_point, auto set_code_point) noexcept
Wrap lines in text that are too wide.
Definition unicode_word_break.hpp:274
unicode_break_vector unicode_word_break(It first, ItEnd last, CodePointFunc const &code_point_func) noexcept
The unicode word break algorithm UAX#29.
Definition unicode_word_break.hpp:238
constexpr Out narrow_cast(In const &rhs) noexcept
Cast numeric values without loss of precision.
Definition cast.hpp:377
Definition unicode_word_break.hpp:25
T back_inserter(T... args)
T distance(T... args)
T next(T... args)
T prev(T... args)
T reserve(T... args)
T transform(T... args)