10#include "unicode_break_opportunity.hpp"
11#include "ucd_general_categories.hpp"
12#include "ucd_grapheme_cluster_breaks.hpp"
13#include "ucd_word_break_properties.hpp"
14#include "../utility/utility.hpp"
15#include "../macros.hpp"
// Construct the packed info value from a word-break property and a pictographic flag.
// The property's underlying integer is OR-ed with the flag shifted into bit 7 (mask 0x80),
// which is the bit that is_pictographic() tests.
// NOTE(review): the constructor body/braces are not visible in this listing.
33 constexpr unicode_word_break_info(unicode_word_break_property
const& word_break_property,
bool pictographic) noexcept :
34 _value(std::to_underlying(word_break_property) | (wide_cast<uint8_t>(pictographic) << 7))
// Returns true when bit 6 (mask 0x40) of _value is set.
// NOTE(review): the code that sets this bit is not visible in this listing; presumably it marks
// entries that the later prev/next scans step over (they test is_skip()) - confirm.
44 [[nodiscard]]
constexpr bool is_skip()
const noexcept
46 return to_bool(_value & 0x40);
// Returns true when bit 7 (mask 0x80) of _value is set.
// The two-argument constructor stores its `pictographic` argument in this bit.
49 [[nodiscard]]
constexpr bool is_pictographic()
const noexcept
51 return to_bool(_value & 0x80);
// Friend comparison: true when the low six bits of lhs._value (mask 0x3f) equal the
// underlying value of rhs. Bits 0x40 and 0x80 are masked out, so the flags read by
// is_skip() and is_pictographic() do not influence the result.
// NOTE(review): the function name and parameter list are missing from this listing;
// presumably operator==(unicode_word_break_info, unicode_word_break_property) - confirm.
54 [[nodiscard]]
constexpr friend bool
57 return (lhs._value & 0x3f) == std::to_underlying(rhs);
// Friend predicate fragments; the names and parameter lists are missing from this listing.
// NOTE(review): presumably these are is_AHLetter() and is_MidNumLetQ(), which are called by
// the rule chain further down - confirm against the full header.
60 [[nodiscard]]
constexpr friend bool
// True when rhs is ALetter or Hebrew_Letter.
65 return rhs == unicode_word_break_property::ALetter or rhs == unicode_word_break_property::Hebrew_Letter;
// True when rhs is MidNumLet or Single_Quote.
70 return rhs == unicode_word_break_property::MidNumLet or rhs == unicode_word_break_property::Single_Quote;
// First pass over adjacent pairs (prev, next) of `infos`, testing newline, ZWJ and
// white-space patterns. The function name, parameter list and every branch body are
// missing from this listing; presumably this is detail::unicode_word_break_WB1_WB3d(r, infos).
// NOTE(review): [[nodiscard]] on a function returning void has no effect and may draw a
// compiler warning - consider removing it.
77[[nodiscard]]
inline void
80 using enum unicode_break_opportunity;
81 using enum unicode_word_break_property;
// Precondition: `r` holds exactly one more element than `infos`.
83 hi_axiom(r.size() == infos.size() + 1);
// Start at 1 so that infos[i - 1] is always a valid index.
88 for (
auto i = 1_uz; i < infos.size(); ++i) {
89 hilet prev = infos[i - 1];
90 hilet next = infos[i];
// CR immediately followed by LF (pattern of UAX #29 rule WB3).
93 if (prev == CR and next == LF) {
95 }
// Previous entry is Newline, CR or LF (pattern of WB3a).
else if (prev == Newline or prev == CR or prev == LF) {
97 }
// Next entry is Newline, CR or LF (pattern of WB3b).
else if (next == Newline or next == CR or next == LF) {
99 }
// ZWJ followed by an entry whose pictographic bit is set (pattern of WB3c).
else if (prev == ZWJ and
next.is_pictographic()) {
101 }
// Two adjacent WSegSpace entries (pattern of WB3d).
else if (prev == WSegSpace and next == WSegSpace) {
// Second pass over adjacent pairs of `infos`, looking for Extend / Format / ZWJ entries
// that do not follow a Newline, CR or LF. The function header and the bodies of both
// `if` statements are missing from this listing; presumably this is
// detail::unicode_word_break_WB4(r, infos).
112 using enum unicode_break_opportunity;
113 using enum unicode_word_break_property;
// Precondition: `r` holds exactly one more element than `infos`.
115 hi_axiom(r.size() == infos.size() + 1);
// Start at 1 so that infos[i - 1] is always a valid index.
117 for (
auto i = 1_uz; i < infos.size(); ++i) {
118 hilet
prev = infos[i - 1];
// `next` is bound by non-const reference, unlike `prev`.
// NOTE(review): the code that mutates it is not visible here; presumably it flags the entry
// as skipped - confirm.
119 auto&
next = infos[i];
// prev is not a line terminator and next is Extend, Format or ZWJ (pattern of UAX #29 rule WB4).
121 if ((prev != Newline and prev != CR and prev != LF) and (next == Extend or next == Format or next == ZWJ)) {
// Only act on positions that have not been decided yet.
122 if (r[i] == unassigned) {
// Third pass: for every index it gathers the nearest non-skipped neighbours
// (prev_prev, prev, next, next_next) and tests them against a chain of patterns.
// The function header, loop exits and every branch body are missing from this listing;
// presumably this is detail::unicode_word_break_WB5_WB999(r, infos).
// NOTE(review): [[nodiscard]] on a function returning void has no effect and may draw a
// compiler warning - consider removing it.
130[[nodiscard]]
inline void
133 using enum unicode_break_opportunity;
134 using enum unicode_word_break_property;
// Precondition: `r` holds exactly one more element than `infos`.
136 hi_axiom(r.size() == infos.size() + 1);
138 for (
auto i = 0_uz; i != infos.size(); ++i) {
// Positions that already carry a value get separate handling (body not visible; presumably skipped).
139 if (r[i] != unassigned) {
143 hilet&
next = infos[i];
// An undecided position is expected never to sit on a skipped entry.
146 hi_axiom(not
next.is_skip());
// Walk backwards from i - 1 to the nearest non-skipped entry; `prev` stays
// default-constructed when none is found. A signed index lets the scan run below zero.
148 auto prev_i = narrow_cast<ptrdiff_t>(i) - 1;
149 auto prev = unicode_word_break_info{};
150 for (; prev_i >= 0 ; --prev_i) {
151 if (not infos[prev_i].is_skip()) {
152 prev = infos[prev_i];
// Continue backwards from just before `prev` to find the entry preceding it.
157 auto prev_prev_i = prev_i - 1;
158 auto prev_prev = unicode_word_break_info{};
159 for (; prev_prev_i >= 0; --prev_prev_i) {
160 if (not infos[prev_prev_i].is_skip()) {
161 prev_prev = infos[prev_prev_i];
// Walk forwards from i + 1 to the nearest non-skipped entry after `next`.
166 auto next_next_i = i + 1;
167 auto next_next = unicode_word_break_info{};
168 for (; next_next_i != infos.size(); ++next_next_i) {
169 if (not infos[next_next_i].is_skip()) {
170 next_next = infos[next_next_i];
// Regional-indicator parity: starting just before `prev`, walk backwards while toggling
// RI_is_pair; skipped entries and non-Regional_Indicator entries are special-cased.
// NOTE(review): the bodies of those two branches are not visible; presumably "continue"
// and "break" respectively - confirm.
175 auto RI_i = prev_i - 1;
176 auto RI_is_pair =
true;
177 if (prev == Regional_Indicator and next == Regional_Indicator) {
179 for (; RI_i >= 0; --RI_i) {
180 if (infos[RI_i].is_skip()) {
182 }
else if (infos[RI_i] != Regional_Indicator) {
185 RI_is_pair = not RI_is_pair;
// Rule chain. Each condition matches the pattern of the UAX #29 rule named in its comment;
// the value assigned in each branch is not visible in this listing.
// WB5: letter followed by letter.
190 if (is_AHLetter(prev) and is_AHLetter(next)) {
192 }
// WB6: letter, then a mid-letter mark that is itself followed by a letter.
else if (is_AHLetter(prev) and (next == MidLetter or is_MidNumLetQ(next)) and is_AHLetter(next_next)) {
194 }
// WB7: letter + mid-letter mark, then a letter.
else if (is_AHLetter(prev_prev) and (prev == MidLetter or is_MidNumLetQ(prev)) and is_AHLetter(next)) {
196 }
// WB7a: Hebrew letter followed by a single quote.
else if (prev == Hebrew_Letter and next == Single_Quote) {
198 }
// WB7b: Hebrew letter, then a double quote that is followed by a Hebrew letter.
else if (prev == Hebrew_Letter and next == Double_Quote and next_next == Hebrew_Letter) {
200 }
// WB7c: Hebrew letter + double quote, then a Hebrew letter.
else if (prev_prev == Hebrew_Letter and prev == Double_Quote and next == Hebrew_Letter) {
202 }
// WB8: digit followed by digit.
else if (prev == Numeric and next == Numeric) {
204 }
// WB9: letter followed by digit.
else if (is_AHLetter(prev) and next == Numeric) {
206 }
// WB10: digit followed by letter.
else if (prev == Numeric and is_AHLetter(next)) {
208 }
// WB11: digit + numeric separator, then a digit.
else if (prev_prev == Numeric and (prev == MidNum or is_MidNumLetQ(prev)) and next == Numeric) {
210 }
// WB12: digit, then a numeric separator that is followed by a digit.
else if (prev == Numeric and (next == MidNum or is_MidNumLetQ(next)) and next_next == Numeric) {
212 }
// WB13: Katakana followed by Katakana.
else if (prev == Katakana and next == Katakana) {
// WB13a: letter, digit, Katakana or ExtendNumLet followed by ExtendNumLet.
// (The opening "else if (" of this condition is missing from the listing.)
215 (is_AHLetter(prev) or prev == Numeric or prev == Katakana or prev == ExtendNumLet) and next == ExtendNumLet) {
217 }
// WB13b: ExtendNumLet followed by letter, digit or Katakana.
else if (prev == ExtendNumLet and (is_AHLetter(next) or next == Numeric or next == Katakana)) {
219 }
// WB15/WB16: two regional indicators, gated by the RI_is_pair parity computed above.
else if (prev == Regional_Indicator and next == Regional_Indicator and RI_is_pair) {
// Template entry point that runs the three detail passes, in order, on the same `r` and `infos`.
// The signature and the code that builds `r` and `infos` are missing from this listing.
// NOTE(review): the third template parameter name is split across two lines by the
// extraction ("CodePo" / "intFunc"); it is presumably a single identifier, CodePointFunc.
237template<
typename It,
typename ItEnd,
typename CodePo
intFunc>
// Pass 1: newline, ZWJ and white-space patterns.
253 detail::unicode_word_break_WB1_WB3d(r,
infos);
// Pass 2: Extend / Format / ZWJ handling.
254 detail::unicode_word_break_WB4(r,
infos);
// Pass 3: letter, numeric, Katakana and regional-indicator patterns.
255 detail::unicode_word_break_WB5_WB999(r,
infos);
// Fragment of a loop that classifies each element of [first, last) by Unicode general category.
// NOTE(review): the enclosing signature, the computation of `code_point` and both branch
// bodies are missing from this listing; presumably this is the body of wrap_lines() - confirm.
276 using enum unicode_general_category;
282 for (
auto it = first;
it != last; ++
it) {
// Look up the general category of the current code point.
284 hilet general_category = ucd_get_general_category(code_point);
// Zp / Zl: paragraph-separator and line-separator categories.
286 if (general_category == Zp || general_category == Zl) {
293 }
// Zs: space-separator category.
else if (general_category == Zs) {
DOXYGEN BUG.
Definition algorithm.hpp:16
void wrap_lines(auto first, auto last, float max_width, auto get_width, auto get_code_point, auto set_code_point) noexcept
Wrap lines in text that are too wide.
Definition unicode_word_break.hpp:274
unicode_break_vector unicode_word_break(It first, ItEnd last, CodePointFunc const &code_point_func) noexcept
The unicode word break algorithm UAX#29.
Definition unicode_word_break.hpp:238
constexpr Out narrow_cast(In const &rhs) noexcept
Cast numeric values without loss of precision.
Definition cast.hpp:377
Definition unicode_word_break.hpp:25
T back_inserter(T... args)