HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
unicode_word_break.hpp
Go to the documentation of this file.
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
8#pragma once
9
10#include "unicode_grapheme_cluster_break.hpp"
11#include "unicode_break_opportunity.hpp"
12#include "../cast.hpp"
13#include <algorithm>
14#include <vector>
15
16namespace hi::inline v1 {
17
/** The word-break property values used by the word break rules in this file.
 *
 * The enumerator names follow the Word_Break property values referenced by
 * the WB-rules implemented below (WB3 .. WB16).
 *
 * @note The underlying value is stored in the low 6 bits of
 *       `detail::unicode_word_break_info` (it compares using a `0x3f` mask),
 *       so every enumerator must stay below `0x40`.
 */
enum class unicode_word_break_property : uint8_t {
    Other,
    CR,
    LF,
    Newline,
    Extend,
    ZWJ,
    Regional_Indicator,
    Format,
    Katakana,
    Hebrew_Letter,
    ALetter,
    Single_Quote,
    Double_Quote,
    MidNumLet,
    MidLetter,
    MidNum,
    Numeric,
    ExtendNumLet,
    WSegSpace
};
39
40namespace detail {
41
43public:
44 constexpr unicode_word_break_info() noexcept : _value(0) {}
45 constexpr unicode_word_break_info(unicode_word_break_info const&) noexcept = default;
46 constexpr unicode_word_break_info(unicode_word_break_info&&) noexcept = default;
47 constexpr unicode_word_break_info& operator=(unicode_word_break_info const&) noexcept = default;
48 constexpr unicode_word_break_info& operator=(unicode_word_break_info&&) noexcept = default;
49
50 constexpr unicode_word_break_info(unicode_word_break_property const& word_break_property, bool pictographic) noexcept :
51 _value(to_underlying(word_break_property) | (static_cast<uint8_t>(pictographic) << 7))
52 {
53 }
54
55 constexpr unicode_word_break_info& make_skip() noexcept
56 {
57 _value |= 0x40;
58 return *this;
59 }
60
61 [[nodiscard]] constexpr bool is_skip() const noexcept
62 {
63 return to_bool(_value & 0x40);
64 }
65
66 [[nodiscard]] constexpr bool is_pictographic() const noexcept
67 {
68 return to_bool(_value & 0x80);
69 }
70
71 [[nodiscard]] constexpr friend bool
72 operator==(unicode_word_break_info const& lhs, unicode_word_break_property const& rhs) noexcept
73 {
74 return (lhs._value & 0x3f) == to_underlying(rhs);
75 }
76
77 [[nodiscard]] constexpr friend bool
78 operator==(unicode_word_break_info const&, unicode_word_break_info const&) noexcept = default;
79
80 [[nodiscard]] constexpr friend bool is_AHLetter(unicode_word_break_info const& rhs) noexcept
81 {
82 return rhs == unicode_word_break_property::ALetter or rhs == unicode_word_break_property::Hebrew_Letter;
83 }
84
85 [[nodiscard]] constexpr friend bool is_MidNumLetQ(unicode_word_break_info const& rhs) noexcept
86 {
87 return rhs == unicode_word_break_property::MidNumLet or rhs == unicode_word_break_property::Single_Quote;
88 }
89
90private:
91 uint8_t _value;
92};
93
94[[nodiscard]] inline void
95unicode_word_break_WB1_WB3d(unicode_break_vector& r, std::vector<unicode_word_break_info>& infos) noexcept
96{
97 using enum unicode_break_opportunity;
98 using enum unicode_word_break_property;
99
100 hi_axiom(r.size() == infos.size() + 1);
101
102 r.front() = yes; // WB1
103 r.back() = yes; // WB2
104
105 for (auto i = 1_uz; i < infos.size(); ++i) {
106 hilet prev = infos[i - 1];
107 hilet next = infos[i];
108
109 r[i] = [&]() {
110 if (prev == CR and next == LF) {
111 return no; // WB3
112 } else if (prev == Newline or prev == CR or prev == LF) {
113 return yes; // WB3a
114 } else if (next == Newline or next == CR or next == LF) {
115 return yes; // WB3b
116 } else if (prev == ZWJ and next.is_pictographic()) {
117 return no; // WB3c
118 } else if (prev == WSegSpace and next == WSegSpace) {
119 return no; // WB3d
120 } else {
121 return unassigned;
122 }
123 }();
124 }
125}
126
127[[nodiscard]] inline void unicode_word_break_WB4(unicode_break_vector& r, std::vector<unicode_word_break_info>& infos) noexcept
128{
129 using enum unicode_break_opportunity;
130 using enum unicode_word_break_property;
131
132 hi_axiom(r.size() == infos.size() + 1);
133
134 for (auto i = 1_uz; i < infos.size(); ++i) {
135 hilet prev = infos[i - 1];
136 auto& next = infos[i];
137
138 if ((prev != Newline and prev != CR and prev != LF) and (next == Extend or next == Format or next == ZWJ)) {
139 if (r[i] == unassigned) {
140 r[i] = no;
141 }
142 next.make_skip();
143 }
144 }
145}
146
147[[nodiscard]] inline void
148unicode_word_break_WB5_WB999(unicode_break_vector& r, std::vector<unicode_word_break_info>& infos) noexcept
149{
150 using enum unicode_break_opportunity;
151 using enum unicode_word_break_property;
152
153 hi_axiom(r.size() == infos.size() + 1);
154
155 auto RI_count = 0_uz;
156 hilet size = narrow<std::ptrdiff_t>(infos.size());
157 for (auto i = 0_z; i < size; ++i) {
158 hilet& next = infos[i];
159 if (next == Regional_Indicator) {
160 ++RI_count;
161 } else {
162 RI_count = 0;
163 }
164
165 if (r[i] != unassigned) {
166 continue;
167 }
168
169 hi_axiom(not next.is_skip());
170
172
173 hilet prev = [&] {
174 for (k = i - 1; k >= 0; --k) {
175 if (not infos[k].is_skip()) {
176 return infos[k];
177 }
178 }
179 return unicode_word_break_info{};
180 }();
181
182 hilet prev_prev = [&] {
183 for (--k; k >= 0; --k) {
184 if (not infos[k].is_skip()) {
185 return infos[k];
186 }
187 }
188 return unicode_word_break_info{};
189 }();
190
191 hilet next_next = [&] {
192 for (k = i + 1; k < size; ++k) {
193 if (not infos[k].is_skip()) {
194 return infos[k];
195 }
196 }
197 return unicode_word_break_info{};
198 }();
199
200 r[i] = [&]() {
201 if (is_AHLetter(prev) and is_AHLetter(next)) {
202 return no; // WB5
203 } else if (is_AHLetter(prev) and (next == MidLetter or is_MidNumLetQ(next)) and is_AHLetter(next_next)) {
204 return no; // WB6
205 } else if (is_AHLetter(prev_prev) and (prev == MidLetter or is_MidNumLetQ(prev)) and is_AHLetter(next)) {
206 return no; // WB7
207 } else if (prev == Hebrew_Letter and next == Single_Quote) {
208 return no; // WB7a
209 } else if (prev == Hebrew_Letter and next == Double_Quote and next_next == Hebrew_Letter) {
210 return no; // WB7b
211 } else if (prev_prev == Hebrew_Letter and prev == Double_Quote and next == Hebrew_Letter) {
212 return no; // WB7c
213 } else if (prev == Numeric and next == Numeric) {
214 return no; // WB8
215 } else if (is_AHLetter(prev) and next == Numeric) {
216 return no; // WB9
217 } else if (prev == Numeric and is_AHLetter(next)) {
218 return no; // WB10
219 } else if (prev_prev == Numeric and (prev == MidNum or is_MidNumLetQ(prev)) and next == Numeric) {
220 return no; // WB11
221 } else if (prev == Numeric and (next == MidNum or is_MidNumLetQ(next)) and next_next == Numeric) {
222 return no; // WB12
223 } else if (prev == Katakana and next == Katakana) {
224 return no; // WB13
225 } else if (
226 (is_AHLetter(prev) or prev == Numeric or prev == Katakana or prev == ExtendNumLet) and next == ExtendNumLet) {
227 return no; // WB13a
228 } else if (prev == ExtendNumLet and (is_AHLetter(next) or next == Numeric or next == Katakana)) {
229 return no; // WB13b
230 } else if (prev == Regional_Indicator and next == Regional_Indicator and (RI_count % 2) == 1) {
231 return no; // WB15 WB16
232 } else {
233 return yes; // WB999
234 }
235 }();
236 }
237}
238
239} // namespace detail
240
248template<typename It, typename ItEnd, typename DescriptionFunc>
249[[nodiscard]] inline unicode_break_vector unicode_word_break(It first, ItEnd last, DescriptionFunc const& description_func)
250{
251 auto size = narrow<size_t>(std::distance(first, last));
252 auto r = unicode_break_vector{size + 1, unicode_break_opportunity::unassigned};
253
255 infos.reserve(size);
256 std::transform(first, last, std::back_inserter(infos), [&](hilet& item) {
257 hilet& description = description_func(item);
259 description.word_break_property(),
260 description.grapheme_cluster_break() == unicode_grapheme_cluster_break::Extended_Pictographic};
261 });
262
263 detail::unicode_word_break_WB1_WB3d(r, infos);
264 detail::unicode_word_break_WB4(r, infos);
265 detail::unicode_word_break_WB5_WB999(r, infos);
266 return r;
267}
268
269} // namespace hi::inline v1
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
DOXYGEN BUG.
Definition algorithm.hpp:15
unicode_break_vector unicode_word_break(It first, ItEnd last, DescriptionFunc const &description_func)
The Unicode word break algorithm (UAX #29).
Definition unicode_word_break.hpp:249
Definition unicode_word_break.hpp:42
T back_inserter(T... args)
T distance(T... args)
T next(T... args)
T prev(T... args)
T reserve(T... args)
T transform(T... args)