10#include "unicode_grapheme_cluster_break.hpp"
11#include "unicode_break_opportunity.hpp"
16namespace hi::inline
v1 {
18enum class unicode_word_break_property : uint8_t {
50 constexpr unicode_word_break_info(unicode_word_break_property
const& word_break_property,
bool pictographic) noexcept :
51 _value(to_underlying(word_break_property) | (
static_cast<uint8_t
>(pictographic) << 7))
61 [[nodiscard]]
constexpr bool is_skip()
const noexcept
63 return to_bool(_value & 0x40);
66 [[nodiscard]]
constexpr bool is_pictographic()
const noexcept
68 return to_bool(_value & 0x80);
71 [[nodiscard]]
constexpr friend bool
74 return (lhs._value & 0x3f) == to_underlying(rhs);
77 [[nodiscard]]
constexpr friend bool
82 return rhs == unicode_word_break_property::ALetter or rhs == unicode_word_break_property::Hebrew_Letter;
87 return rhs == unicode_word_break_property::MidNumLet or rhs == unicode_word_break_property::Single_Quote;
94[[nodiscard]]
inline void
97 using enum unicode_break_opportunity;
98 using enum unicode_word_break_property;
100 hi_axiom(r.size() == infos.size() + 1);
105 for (
auto i = 1_uz; i < infos.size(); ++i) {
106 hilet prev = infos[i - 1];
107 hilet next = infos[i];
110 if (prev == CR and next == LF) {
112 }
else if (prev == Newline or prev == CR or prev == LF) {
114 }
else if (next == Newline or next == CR or next == LF) {
116 }
else if (prev == ZWJ and
next.is_pictographic()) {
118 }
else if (prev == WSegSpace and next == WSegSpace) {
129 using enum unicode_break_opportunity;
130 using enum unicode_word_break_property;
132 hi_axiom(r.size() == infos.size() + 1);
134 for (
auto i = 1_uz; i < infos.size(); ++i) {
136 auto&
next = infos[i];
138 if ((prev != Newline and prev != CR and prev != LF) and (next == Extend or next == Format or next == ZWJ)) {
139 if (r[i] == unassigned) {
147[[nodiscard]]
inline void
150 using enum unicode_break_opportunity;
151 using enum unicode_word_break_property;
153 hi_axiom(r.size() == infos.size() + 1);
155 auto RI_count = 0_uz;
156 hilet size = narrow_cast<std::ptrdiff_t>(infos.size());
157 for (
auto i = 0_z; i < size; ++i) {
159 if (next == Regional_Indicator) {
165 if (r[i] != unassigned) {
174 for (k = i - 1; k >= 0; --k) {
175 if (not infos[k].is_skip()) {
179 return unicode_word_break_info{};
182 hilet prev_prev = [&] {
183 for (--k; k >= 0; --k) {
184 if (not infos[k].is_skip()) {
188 return unicode_word_break_info{};
191 hilet next_next = [&] {
192 for (k = i + 1; k < size; ++k) {
193 if (not infos[k].is_skip()) {
197 return unicode_word_break_info{};
201 if (is_AHLetter(prev) and is_AHLetter(next)) {
203 }
else if (is_AHLetter(prev) and (next == MidLetter or is_MidNumLetQ(next)) and is_AHLetter(next_next)) {
205 }
else if (is_AHLetter(prev_prev) and (prev == MidLetter or is_MidNumLetQ(prev)) and is_AHLetter(next)) {
207 }
else if (prev == Hebrew_Letter and next == Single_Quote) {
209 }
else if (prev == Hebrew_Letter and next == Double_Quote and next_next == Hebrew_Letter) {
211 }
else if (prev_prev == Hebrew_Letter and prev == Double_Quote and next == Hebrew_Letter) {
213 }
else if (prev == Numeric and next == Numeric) {
215 }
else if (is_AHLetter(prev) and
next == Numeric) {
217 }
else if (prev == Numeric and is_AHLetter(next)) {
219 }
else if (prev_prev == Numeric and (prev == MidNum or is_MidNumLetQ(prev)) and
next == Numeric) {
221 }
else if (prev == Numeric and (next == MidNum or is_MidNumLetQ(next)) and next_next == Numeric) {
223 }
else if (prev == Katakana and next == Katakana) {
226 (is_AHLetter(prev) or prev == Numeric or prev == Katakana or prev == ExtendNumLet) and
next == ExtendNumLet) {
228 }
else if (prev == ExtendNumLet and (is_AHLetter(next) or next == Numeric or next == Katakana)) {
230 }
else if (prev == Regional_Indicator and next == Regional_Indicator and (RI_count % 2) == 1) {
248template<
typename It,
typename ItEnd,
typename DescriptionFunc>
257 hilet& description = description_func(item);
259 description.word_break_property(),
260 description.grapheme_cluster_break() == unicode_grapheme_cluster_break::Extended_Pictographic};
263 detail::unicode_word_break_WB1_WB3d(r, infos);
264 detail::unicode_word_break_WB4(r, infos);
265 detail::unicode_word_break_WB5_WB999(r, infos);
#define hi_axiom(expression,...)
Specify an axiom; an expression that is true.
Definition assert.hpp:133
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
DOXYGEN BUG.
Definition algorithm.hpp:15
unicode_break_vector unicode_word_break(It first, ItEnd last, DescriptionFunc const &description_func) noexcept
The unicode word break algorithm UAX#29.
Definition unicode_word_break.hpp:249
Definition unicode_word_break.hpp:42
T back_inserter(T... args)