HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
unicode_word_break.hpp
Go to the documentation of this file.
1// Copyright Take Vos 2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
8#pragma once
9
10#include "unicode_grapheme_cluster_break.hpp"
11#include "unicode_break_opportunity.hpp"
12#include "../cast.hpp"
13#include <algorithm>
14#include <vector>
15
16namespace hi::inline v1{
17
/** Word_Break property values used by the UAX #29 word-break rules in this file.
 *
 * The numeric values must stay below 0x40: `detail::unicode_word_break_info`
 * stores the property in the low 6 bits of a byte and uses the two upper bits
 * as flags.
 */
enum class unicode_word_break_property : uint8_t {
    Other, ///< Default value; also what a default-constructed info compares equal to.
    CR,
    LF,
    Newline,
    Extend,
    ZWJ,
    Regional_Indicator,
    Format,
    Katakana,
    Hebrew_Letter,
    ALetter,
    Single_Quote,
    Double_Quote,
    MidNumLet,
    MidLetter,
    MidNum,
    Numeric,
    ExtendNumLet,
    WSegSpace
};
39
40namespace detail {
41
43public:
44 constexpr unicode_word_break_info() noexcept : _value(0)
45 {}
46 constexpr unicode_word_break_info(unicode_word_break_info const &) noexcept = default;
47 constexpr unicode_word_break_info(unicode_word_break_info &&) noexcept = default;
48 constexpr unicode_word_break_info &operator=(unicode_word_break_info const &) noexcept = default;
49 constexpr unicode_word_break_info &operator=(unicode_word_break_info &&) noexcept = default;
50
51 constexpr unicode_word_break_info(unicode_word_break_property const &word_break_property, bool pictographic) noexcept : _value(to_underlying(word_break_property) | (static_cast<uint8_t>(pictographic) << 7))
52 {}
53
54 constexpr unicode_word_break_info &make_skip() noexcept
55 {
56 _value |= 0x40;
57 return *this;
58 }
59
60 [[nodiscard]] constexpr bool is_skip() const noexcept
61 {
62 return static_cast<bool>(_value & 0x40);
63 }
64
65 [[nodiscard]] constexpr bool is_pictographic() const noexcept
66 {
67 return static_cast<bool>(_value & 0x80);
68 }
69
70 [[nodiscard]] constexpr friend bool operator==(unicode_word_break_info const &lhs, unicode_word_break_property const &rhs) noexcept
71 {
72 return (lhs._value & 0x3f) == to_underlying(rhs);
73 }
74
75 [[nodiscard]] constexpr friend bool operator==(unicode_word_break_info const &, unicode_word_break_info const &) noexcept = default;
76
77 [[nodiscard]] constexpr friend bool is_AHLetter(unicode_word_break_info const &rhs) noexcept
78 {
79 return rhs == unicode_word_break_property::ALetter or rhs == unicode_word_break_property::Hebrew_Letter;
80 }
81
82 [[nodiscard]] constexpr friend bool is_MidNumLetQ(unicode_word_break_info const &rhs) noexcept
83 {
84 return rhs == unicode_word_break_property::MidNumLet or rhs == unicode_word_break_property::Single_Quote;
85 }
86
87private:
88 uint8_t _value;
89};
90
91[[nodiscard]] inline void unicode_word_break_WB1_WB3d(
94{
95 using enum unicode_break_opportunity;
96 using enum unicode_word_break_property;
97
98 hi_axiom(r.size() == infos.size() + 1);
99
100 r.front() = yes; // WB1
101 r.back() = yes; // WB2
102
103 for (auto i = 1_uz; i < infos.size(); ++i) {
104 hilet prev = infos[i - 1];
105 hilet next = infos[i];
106
107 r[i] = [&] () {
108 if (prev == CR and next == LF) {
109 return no; // WB3
110 } else if (prev == Newline or prev == CR or prev == LF) {
111 return yes; // WB3a
112 } else if (next == Newline or next == CR or next == LF) {
113 return yes; // WB3b
114 } else if (prev == ZWJ and next.is_pictographic()) {
115 return no; // WB3c
116 } else if (prev == WSegSpace and next == WSegSpace) {
117 return no; // WB3d
118 } else {
119 return unassigned;
120 }
121 }();
122 }
123}
124
125[[nodiscard]] inline void unicode_word_break_WB4(
126 unicode_break_vector &r,
128{
129 using enum unicode_break_opportunity;
130 using enum unicode_word_break_property;
131
132 hi_axiom(r.size() == infos.size() + 1);
133
134 for (auto i = 1_uz; i < infos.size(); ++i) {
135 hilet prev = infos[i - 1];
136 auto &next = infos[i];
137
138 if ((prev != Newline and prev != CR and prev != LF) and (next == Extend or next == Format or next == ZWJ)) {
139 if (r[i] == unassigned) {
140 r[i] = no;
141 }
142 next.make_skip();
143 }
144 }
145}
146
147[[nodiscard]] inline void unicode_word_break_WB5_WB999(
148 unicode_break_vector &r,
150{
151 using enum unicode_break_opportunity;
152 using enum unicode_word_break_property;
153
154 hi_axiom(r.size() == infos.size() + 1);
155
156 auto RI_count = 0_uz;
157 hilet size = narrow<std::ptrdiff_t>(infos.size());
158 for (auto i = 0_z; i < size; ++i) {
159 hilet &next = infos[i];
160 if (next == Regional_Indicator) {
161 ++RI_count;
162 } else {
163 RI_count = 0;
164 }
165
166 if (r[i] != unassigned) {
167 continue;
168 }
169
170 hi_axiom(not next.is_skip());
171
173
174 hilet prev = [&] {
175 for (k = i - 1; k >= 0; --k) {
176 if (not infos[k].is_skip()) {
177 return infos[k];
178 }
179 }
180 return unicode_word_break_info{};
181 }();
182
183 hilet prev_prev = [&] {
184 for (--k; k >= 0; --k) {
185 if (not infos[k].is_skip()) {
186 return infos[k];
187 }
188 }
189 return unicode_word_break_info{};
190 }();
191
192 hilet next_next = [&] {
193 for (k = i + 1; k < size; ++k) {
194 if (not infos[k].is_skip()) {
195 return infos[k];
196 }
197 }
198 return unicode_word_break_info{};
199 }();
200
201 r[i] = [&] () {
202 if (is_AHLetter(prev) and is_AHLetter(next)) {
203 return no; // WB5
204 } else if (is_AHLetter(prev) and (next == MidLetter or is_MidNumLetQ(next)) and is_AHLetter(next_next)) {
205 return no; // WB6
206 } else if (is_AHLetter(prev_prev) and (prev == MidLetter or is_MidNumLetQ(prev)) and is_AHLetter(next)) {
207 return no; // WB7
208 } else if (prev == Hebrew_Letter and next == Single_Quote) {
209 return no; // WB7a
210 } else if (prev == Hebrew_Letter and next == Double_Quote and next_next == Hebrew_Letter) {
211 return no; // WB7b
212 } else if (prev_prev == Hebrew_Letter and prev == Double_Quote and next == Hebrew_Letter) {
213 return no; // WB7c
214 } else if (prev == Numeric and next == Numeric) {
215 return no; // WB8
216 } else if (is_AHLetter(prev) and next == Numeric) {
217 return no; // WB9
218 } else if (prev == Numeric and is_AHLetter(next)) {
219 return no; // WB10
220 } else if (prev_prev == Numeric and (prev == MidNum or is_MidNumLetQ(prev)) and next == Numeric) {
221 return no; // WB11
222 } else if (prev == Numeric and (next == MidNum or is_MidNumLetQ(next)) and next_next == Numeric) {
223 return no; // WB12
224 } else if (prev == Katakana and next == Katakana) {
225 return no; // WB13
226 } else if ((is_AHLetter(prev) or prev == Numeric or prev == Katakana or prev == ExtendNumLet) and next == ExtendNumLet) {
227 return no; // WB13a
228 } else if (prev == ExtendNumLet and (is_AHLetter(next) or next == Numeric or next == Katakana)) {
229 return no; // WB13b
230 } else if (prev == Regional_Indicator and next == Regional_Indicator and (RI_count % 2) == 1) {
231 return no; // WB15 WB16
232 } else {
233 return yes; // WB999
234 }
235 }();
236 }
237}
238
239}
240
248template<typename It, typename ItEnd, typename DescriptionFunc>
249[[nodiscard]] inline unicode_break_vector unicode_word_break(It first, ItEnd last, DescriptionFunc const &description_func)
250{
251 auto size = narrow<size_t>(std::distance(first, last));
252 auto r = unicode_break_vector{size + 1, unicode_break_opportunity::unassigned};
253
255 infos.reserve(size);
256 std::transform(first, last, std::back_inserter(infos), [&] (hilet &item) {
257 hilet &description = description_func(item);
258 return detail::unicode_word_break_info{description.word_break_property(), description.grapheme_cluster_break() == unicode_grapheme_cluster_break::Extended_Pictographic};
259 });
260
261 detail::unicode_word_break_WB1_WB3d(r, infos);
262 detail::unicode_word_break_WB4(r, infos);
263 detail::unicode_word_break_WB5_WB999(r, infos);
264 return r;
265}
266
267}
#define hilet
Invariant should be the default for variables.
Definition required.hpp:23
unicode_break_vector unicode_word_break(It first, ItEnd last, DescriptionFunc const &description_func)
The Unicode word-break algorithm (UAX #29).
Definition unicode_word_break.hpp:249
Definition unicode_word_break.hpp:42
T back_inserter(T... args)
T distance(T... args)
T next(T... args)
T prev(T... args)
T reserve(T... args)
T transform(T... args)