10#include "unicode_grapheme_cluster_break.hpp"
11#include "unicode_break_opportunity.hpp"
12#include "../utility/module.hpp"
16namespace hi::inline
v1 {
18enum class unicode_word_break_property : uint8_t {
// Constructs the packed one-byte record: the underlying value of
// `word_break_property` is OR-ed with the `pictographic` flag shifted into
// bit 7 (mask 0x80).
// NOTE(review): the constructor body and the declaration of `_value` are not
// part of this excerpt.
50 constexpr unicode_word_break_info(unicode_word_break_property
const& word_break_property,
bool pictographic) noexcept :
51 _value(to_underlying(word_break_property) | (wide_cast<uint8_t>(pictographic) << 7))
// Reports whether bit 6 (mask 0x40) of the packed value is set.
// NOTE(review): the code that sets this bit is not visible in this excerpt;
// presumably it marks entries that later passes should step over -- confirm.
61 [[nodiscard]]
constexpr bool is_skip()
const noexcept
63 return to_bool(_value & 0x40);
// Reports whether bit 7 (mask 0x80) of the packed value is set; this is the
// bit the constructor fills from its `pictographic` argument.
66 [[nodiscard]]
constexpr bool is_pictographic()
const noexcept
68 return to_bool(_value & 0x80);
// Equality between a packed record (`lhs`) and a property (`rhs`): only the
// low six bits (mask 0x3f) of `lhs._value` are compared with the property's
// underlying value, so the 0x40 and 0x80 flag bits never affect the result.
// NOTE(review): the line carrying the operator name and parameter list is
// missing from this excerpt.
71 [[nodiscard]]
constexpr friend bool
74 return (lhs._value & 0x3f) == to_underlying(rhs);
// NOTE(review): two predicate definitions are fused here; their signature
// lines are missing from this excerpt. Judging by the call sites later in the
// file they are presumably is_AHLetter() and is_MidNumLetQ() -- confirm.
77 [[nodiscard]]
constexpr friend bool
// True when `rhs` is ALetter or Hebrew_Letter.
82 return rhs == unicode_word_break_property::ALetter or rhs == unicode_word_break_property::Hebrew_Letter;
// True when `rhs` is MidNumLet or Single_Quote.
87 return rhs == unicode_word_break_property::MidNumLet or rhs == unicode_word_break_property::Single_Quote;
// First pass over the break vector `r`, driven by adjacent pairs of `infos`.
// NOTE(review): the function name/parameter line and the body of every branch
// are missing from this excerpt; presumably this is
// detail::unicode_word_break_WB1_WB3d(r, infos) -- confirm.
// NOTE(review): [[nodiscard]] on a function returning void has no effect and
// draws a warning from some compilers; consider removing it.
94[[nodiscard]]
inline void
97 using enum unicode_break_opportunity;
98 using enum unicode_word_break_property;
// Precondition: `r` holds exactly one more element than `infos`.
100 hi_axiom(r.size() == infos.size() + 1);
// Walk every adjacent (prev, next) pair; index 0 has no predecessor, so the
// loop starts at 1.
105 for (
auto i = 1_uz; i < infos.size(); ++i) {
106 hilet prev = infos[i - 1];
107 hilet next = infos[i];
// The conditions below correspond, in order, to UAX #29 rules WB3 (CR LF),
// WB3a (after Newline/CR/LF), WB3b (before Newline/CR/LF), WB3c (ZWJ followed
// by a pictographic entry) and WB3d (WSegSpace WSegSpace). What each branch
// stores is on lines not present in this excerpt.
110 if (prev == CR and next == LF) {
112 }
else if (prev == Newline or prev == CR or prev == LF) {
114 }
else if (next == Newline or next == CR or next == LF) {
116 }
else if (prev == ZWJ and
next.is_pictographic()) {
118 }
else if (prev == WSegSpace and next == WSegSpace) {
// NOTE(review): interior of a second pass whose signature, the declaration of
// `prev`, and the branch bodies are missing from this excerpt; presumably this
// is detail::unicode_word_break_WB4(r, infos) -- confirm.
129 using enum unicode_break_opportunity;
130 using enum unicode_word_break_property;
// Precondition: `r` holds exactly one more element than `infos`.
132 hi_axiom(r.size() == infos.size() + 1);
134 for (
auto i = 1_uz; i < infos.size(); ++i) {
// `next` is bound by non-const reference, so the element of `infos` can be
// modified in place (presumably to flag it as skipped -- confirm).
136 auto&
next = infos[i];
// Matches an Extend, Format or ZWJ entry that does not follow Newline, CR or
// LF -- the condition of UAX #29 rule WB4.
138 if ((prev != Newline and prev != CR and prev != LF) and (next == Extend or next == Format or next == ZWJ)) {
// Only positions still marked `unassigned` enter this inner branch.
139 if (r[i] == unassigned) {
// Third pass: for each position, gathers up to two non-skipped neighbours on
// either side and tests them against the remaining word-break rules.
// NOTE(review): the signature line, the declaration of `next`, loop exits and
// all branch bodies are missing from this excerpt; presumably this is
// detail::unicode_word_break_WB5_WB999(r, infos) -- confirm.
// NOTE(review): [[nodiscard]] on a function returning void has no effect and
// draws a warning from some compilers; consider removing it.
147[[nodiscard]]
inline void
150 using enum unicode_break_opportunity;
151 using enum unicode_word_break_property;
// Precondition: `r` holds exactly one more element than `infos`.
153 hi_axiom(r.size() == infos.size() + 1);
155 for (
auto i = 0_uz; i != infos.size(); ++i) {
// Positions already decided (anything other than `unassigned`) take this
// branch; its body is not visible here.
156 if (r[i] != unassigned) {
// Backward scan from i - 1 for an entry that is not flagged is_skip(). The
// index is signed (ptrdiff_t) so that it can go below zero and end the loop;
// `prev` stays default-constructed when no such entry exists.
165 auto prev_i = narrow_cast<ptrdiff_t>(i) - 1;
166 auto prev = unicode_word_break_info{};
167 for (; prev_i >= 0 ; --prev_i) {
168 if (not infos[prev_i].is_skip()) {
169 prev = infos[prev_i];
// Same backward scan, continuing from just before the position where the
// `prev` scan stopped.
174 auto prev_prev_i = prev_i - 1;
175 auto prev_prev = unicode_word_break_info{};
176 for (; prev_prev_i >= 0; --prev_prev_i) {
177 if (not infos[prev_prev_i].is_skip()) {
178 prev_prev = infos[prev_prev_i];
// Forward scan from i + 1 for a non-skipped entry.
183 auto next_next_i = i + 1;
184 auto next_next = unicode_word_break_info{};
185 for (; next_next_i != infos.size(); ++next_next_i) {
186 if (not infos[next_next_i].is_skip()) {
187 next_next = infos[next_next_i];
// Regional_Indicator handling: starting just before `prev`, walk backwards
// and flip `RI_is_pair` (initially true) as entries are visited; skipped
// entries and non-Regional_Indicator entries have their own branches whose
// bodies are not visible here. Presumably this tracks the parity of the
// Regional_Indicator run preceding `next` -- confirm.
192 auto RI_i = prev_i - 1;
193 auto RI_is_pair =
true;
194 if (prev == Regional_Indicator and next == Regional_Indicator) {
196 for (; RI_i >= 0; --RI_i) {
197 if (infos[RI_i].is_skip()) {
199 }
else if (infos[RI_i] != Regional_Indicator) {
202 RI_is_pair = not RI_is_pair;
// Rule chain. The conditions correspond, in order, to UAX #29 rules WB5, WB6,
// WB7, WB7a, WB7b, WB7c, WB8, WB9, WB10, WB11, WB12, WB13, WB13a, WB13b and
// WB15/WB16. What each branch stores is on lines missing from this excerpt.
207 if (is_AHLetter(prev) and is_AHLetter(next)) {
209 }
else if (is_AHLetter(prev) and (next == MidLetter or is_MidNumLetQ(next)) and is_AHLetter(next_next)) {
211 }
else if (is_AHLetter(prev_prev) and (prev == MidLetter or is_MidNumLetQ(prev)) and is_AHLetter(next)) {
213 }
else if (prev == Hebrew_Letter and next == Single_Quote) {
215 }
else if (prev == Hebrew_Letter and next == Double_Quote and next_next == Hebrew_Letter) {
217 }
else if (prev_prev == Hebrew_Letter and prev == Double_Quote and next == Hebrew_Letter) {
219 }
else if (prev == Numeric and next == Numeric) {
221 }
else if (is_AHLetter(prev) and next == Numeric) {
223 }
else if (prev == Numeric and is_AHLetter(next)) {
225 }
else if (prev_prev == Numeric and (prev == MidNum or is_MidNumLetQ(prev)) and next == Numeric) {
227 }
else if (prev == Numeric and (next == MidNum or is_MidNumLetQ(next)) and next_next == Numeric) {
229 }
else if (prev == Katakana and next == Katakana) {
232 (is_AHLetter(prev) or prev == Numeric or prev == Katakana or prev == ExtendNumLet) and next == ExtendNumLet) {
234 }
else if (prev == ExtendNumLet and (is_AHLetter(next) or next == Numeric or next == Katakana)) {
236 }
// Taken only when `RI_is_pair` is still true after the backward scan above.
else if (prev == Regional_Indicator and next == Regional_Indicator and RI_is_pair) {
// Entry point of the word-break computation, returning a unicode_break_vector.
// NOTE(review): the function name/parameter line, the loop header and the
// return statement are missing from this excerpt.
254template<
typename It,
typename ItEnd,
typename DescriptionFunc>
255[[nodiscard]]
inline unicode_break_vector
// Look up the description of the current item through the caller-supplied
// callable.
264 hilet& description = description_func(item);
// A record is built from the item's word-break property plus a flag that is
// true when its grapheme-cluster-break class is Extended_Pictographic.
266 description.word_break_property(),
267 description.grapheme_cluster_break() == unicode_grapheme_cluster_break::Extended_Pictographic};
// The three passes are called in this fixed order on the same `r` and `infos`.
270 detail::unicode_word_break_WB1_WB3d(r, infos);
271 detail::unicode_word_break_WB4(r, infos);
272 detail::unicode_word_break_WB5_WB999(r, infos);
#define hi_axiom(expression,...)
Specify an axiom; an expression that is true.
Definition assert.hpp:238
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
DOXYGEN BUG.
Definition algorithm.hpp:13
unicode_break_vector unicode_word_break(It first, ItEnd last, DescriptionFunc const &description_func) noexcept
The Unicode word-break algorithm (UAX #29).
Definition unicode_word_break.hpp:256
Definition unicode_word_break.hpp:42
T back_inserter(T... args)