10#include "unicode_grapheme_cluster_break.hpp"
11#include "unicode_break_opportunity.hpp"
16namespace hi::inline v1{
18 enum class unicode_word_break_property : uint8_t {
/** Construct a packed word-break info value.
 *
 * The underlying value of `word_break_property` is OR-ed with the
 * `pictographic` flag shifted into bit 7 (mask 0x80) and stored in `_value`.
 *
 * @param word_break_property The word-break property to store in the low bits.
 * @param pictographic Flag stored in bit 7; read back by `is_pictographic()`.
 *
 * NOTE(review): the constructor body is not visible in this excerpt.
 */
51 constexpr unicode_word_break_info(unicode_word_break_property
const &word_break_property,
bool pictographic) noexcept : _value(to_underlying(word_break_property) | (
static_cast<uint8_t
>(pictographic) << 7))
/** Test the skip flag of this entry.
 *
 * @return True when bit 6 (mask 0x40) of `_value` is set.
 *
 * NOTE(review): where the flag gets set is not visible in this excerpt;
 * the WB5-WB999 pass steps over entries for which this returns true when it
 * looks for neighbouring entries.
 */
60 [[nodiscard]]
constexpr bool is_skip()
const noexcept
62 return static_cast<bool>(_value & 0x40);
/** Test the pictographic flag of this entry.
 *
 * @return True when bit 7 (mask 0x80) of `_value` is set; this is the bit
 *         the two-argument constructor fills from its `pictographic` argument.
 */
65 [[nodiscard]]
constexpr bool is_pictographic()
const noexcept
67 return static_cast<bool>(_value & 0x80);
/** Compare the stored word-break property with a property value.
 *
 * Only the low six bits (mask 0x3f) of `lhs._value` take part in the
 * comparison, so the flag bits 0x40 (skip) and 0x80 (pictographic) are ignored.
 *
 * @param lhs The packed info value.
 * @param rhs The property to compare against.
 * @return True when the low six bits of `lhs` equal the underlying value of `rhs`.
 */
70 [[nodiscard]]
constexpr friend bool operator==(
unicode_word_break_info const &lhs, unicode_word_break_property
const &rhs)
noexcept
72 return (lhs._value & 0x3f) == to_underlying(rhs);
79 return rhs == unicode_word_break_property::ALetter or rhs == unicode_word_break_property::Hebrew_Letter;
84 return rhs == unicode_word_break_property::MidNumLet or rhs == unicode_word_break_property::Single_Quote;
/** First pass of the word-break algorithm (rules WB1 through WB3d by its name).
 *
 * Walks every adjacent pair `infos[i - 1]`, `infos[i]` for `i` in
 * `[1, infos.size())` and tests the pair against a fixed, ordered list of
 * conditions. `r` is expected to hold one more element than `infos`.
 *
 * NOTE(review): the parameter list and the statements inside each branch are
 * not visible in this excerpt, so the break opportunity chosen per branch is
 * not documented here; the rule labels below follow UAX #29 and should be
 * confirmed against the branch bodies.
 * NOTE(review): `[[nodiscard]]` has no effect on a function returning `void`.
 */
91[[nodiscard]]
inline void unicode_word_break_WB1_WB3d(
95 using enum unicode_break_opportunity;
96 using enum unicode_word_break_property;
// Precondition: one break slot per position between/around the entries.
98 hi_axiom(r.size() == infos.size() + 1);
// Start at 1 so that infos[i - 1] is always valid.
103 for (
auto i = 1_uz; i < infos.size(); ++i) {
104 hilet prev = infos[i - 1];
105 hilet next = infos[i];
// CR followed by LF (condition of UAX #29 WB3). Must be tested before the
// generic newline checks below, which would otherwise match first.
108 if (prev == CR and next == LF) {
110 }
// Previous entry is a newline-like code point (condition of WB3a).
else if (prev == Newline or prev == CR or prev == LF) {
112 }
// Next entry is a newline-like code point (condition of WB3b).
else if (next == Newline or next == CR or next == LF) {
114 }
// ZWJ followed by an entry with the pictographic flag (condition of WB3c).
else if (prev == ZWJ and
next.is_pictographic()) {
116 }
// Two adjacent WSegSpace entries (condition of WB3d).
else if (prev == WSegSpace and next == WSegSpace) {
/** Second pass of the word-break algorithm (rule WB4 by its name).
 *
 * Walks the entries from index 1 and looks for an Extend, Format or ZWJ entry
 * whose predecessor is not Newline, CR or LF. `r` is expected to hold one more
 * element than `infos`.
 *
 * @param r The break-opportunity vector; `r[i]` is inspected for `unassigned`.
 *
 * NOTE(review): the remaining parameters, the declaration of `prev` and the
 * statements inside the two `if` bodies are not visible in this excerpt.
 * `next` is bound as a non-const reference, so presumably the entry is
 * modified here (e.g. marked as skip) - confirm against the full source.
 * NOTE(review): `[[nodiscard]]` has no effect on a function returning `void`.
 */
125[[nodiscard]]
inline void unicode_word_break_WB4(
126 unicode_break_vector &r,
129 using enum unicode_break_opportunity;
130 using enum unicode_word_break_property;
// Precondition: one break slot per position between/around the entries.
132 hi_axiom(r.size() == infos.size() + 1);
134 for (
auto i = 1_uz; i < infos.size(); ++i) {
// Non-const reference to the current entry.
136 auto &
next = infos[i];
// Extend/Format/ZWJ that does not directly follow a newline-like entry.
138 if ((prev != Newline and prev != CR and prev != LF) and (next == Extend or next == Format or next == ZWJ)) {
// Only act on boundaries that an earlier pass left undecided.
139 if (r[i] == unassigned) {
/** Third pass of the word-break algorithm (rules WB5 through WB999 by its name).
 *
 * For each index `i` the pass gathers up to four context entries - `prev_prev`,
 * `prev`, `next` and `next_next` - ignoring entries whose skip flag is set,
 * and then tests them against an ordered chain of conditions. `r` is expected
 * to hold one more element than `infos`.
 *
 * @param r The break-opportunity vector; `r[i]` is compared with `unassigned`.
 *
 * NOTE(review): the remaining parameters, the declaration of `next`/`k`, the
 * update of `RI_count` and the statement inside every branch are not visible
 * in this excerpt. The rule labels below follow UAX #29 and should be
 * confirmed against the branch bodies.
 * NOTE(review): `[[nodiscard]]` has no effect on a function returning `void`.
 */
147[[nodiscard]]
inline void unicode_word_break_WB5_WB999(
148 unicode_break_vector &r,
151 using enum unicode_break_opportunity;
152 using enum unicode_word_break_property;
// Precondition: one break slot per position between/around the entries.
154 hi_axiom(r.size() == infos.size() + 1);
// Counter used by the Regional_Indicator rule at the end of the chain.
156 auto RI_count = 0_uz;
// Signed size so the backward scans below can run `k` down past zero.
157 hilet size = narrow<std::ptrdiff_t>(infos.size());
158 for (
auto i = 0_z; i < size; ++i) {
// Presumably where RI_count is maintained - body not visible here.
160 if (next == Regional_Indicator) {
// Boundary already has a value other than `unassigned`.
166 if (r[i] != unassigned) {
// At this point the current entry is expected not to carry the skip flag.
170 hi_axiom(not
next.is_skip());
// prev: scan backwards from i - 1 for an entry whose skip flag is clear;
// a default-constructed info is returned when the scan finds none.
175 for (k = i - 1; k >= 0; --k) {
176 if (not infos[k].is_skip()) {
180 return unicode_word_break_info{};
// prev_prev: continue the backward scan from where `prev` stopped (shares `k`).
183 hilet prev_prev = [&] {
184 for (--k; k >= 0; --k) {
185 if (not infos[k].is_skip()) {
189 return unicode_word_break_info{};
// next_next: scan forwards from i + 1 for an entry whose skip flag is clear.
192 hilet next_next = [&] {
193 for (k = i + 1; k < size; ++k) {
194 if (not infos[k].is_skip()) {
198 return unicode_word_break_info{};
// WB5: letter followed by letter.
202 if (is_AHLetter(prev) and is_AHLetter(next)) {
204 }
// WB6: letter, then a mid-letter punctuation that is itself followed by a letter.
else if (is_AHLetter(prev) and (next == MidLetter or is_MidNumLetQ(next)) and is_AHLetter(next_next)) {
206 }
// WB7: letter + mid-letter punctuation, then a letter.
else if (is_AHLetter(prev_prev) and (prev == MidLetter or is_MidNumLetQ(prev)) and is_AHLetter(next)) {
208 }
// WB7a: Hebrew letter followed by a single quote.
else if (prev == Hebrew_Letter and next == Single_Quote) {
210 }
// WB7b: Hebrew letter, then a double quote that is followed by a Hebrew letter.
else if (prev == Hebrew_Letter and next == Double_Quote and next_next == Hebrew_Letter) {
212 }
// WB7c: Hebrew letter + double quote, then a Hebrew letter.
else if (prev_prev == Hebrew_Letter and prev == Double_Quote and next == Hebrew_Letter) {
214 }
// WB8: digit followed by digit.
else if (prev == Numeric and next == Numeric) {
216 }
// WB9: letter followed by digit.
else if (is_AHLetter(prev) and
next == Numeric) {
218 }
// WB10: digit followed by letter.
else if (prev == Numeric and is_AHLetter(next)) {
220 }
// WB11: digit + numeric punctuation, then a digit.
else if (prev_prev == Numeric and (prev == MidNum or is_MidNumLetQ(prev)) and
next == Numeric) {
222 }
// WB12: digit, then numeric punctuation that is followed by a digit.
else if (prev == Numeric and (next == MidNum or is_MidNumLetQ(next)) and next_next == Numeric) {
224 }
// WB13: Katakana followed by Katakana.
else if (prev == Katakana and next == Katakana) {
226 }
// WB13a: letter/digit/Katakana/ExtendNumLet followed by ExtendNumLet.
else if ((is_AHLetter(prev) or prev == Numeric or prev == Katakana or prev == ExtendNumLet) and
next == ExtendNumLet) {
228 }
// WB13b: ExtendNumLet followed by letter/digit/Katakana.
else if (prev == ExtendNumLet and (is_AHLetter(next) or next == Numeric or next == Katakana)) {
230 }
// WB15/WB16: two Regional_Indicator entries while RI_count is odd.
else if (prev == Regional_Indicator and next == Regional_Indicator and (RI_count % 2) == 1) {
/** The unicode word break algorithm UAX#29.
 *
 * Signature as shown elsewhere in this listing:
 * `unicode_break_vector unicode_word_break(It first, ItEnd last, DescriptionFunc const &description_func)`.
 *
 * Each item is mapped through `description_func` to a description, from which
 * a `detail::unicode_word_break_info` is built; the three rule passes are then
 * run over `r` and `infos` in the order WB1-WB3d, WB4, WB5-WB999.
 *
 * NOTE(review): the function header, the set-up of `r` and `infos` and the
 * return statement are not visible in this excerpt.
 */
248template<
typename It,
typename ItEnd,
typename DescriptionFunc>
// Look up the description for the current item.
257 hilet &description = description_func(item);
// The pictographic flag is true when the grapheme-cluster-break property is Extended_Pictographic.
258 return detail::unicode_word_break_info{description.word_break_property(), description.grapheme_cluster_break() == unicode_grapheme_cluster_break::Extended_Pictographic};
// Run the rule passes in order over the same `r` and `infos`.
261 detail::unicode_word_break_WB1_WB3d(r, infos);
262 detail::unicode_word_break_WB4(r, infos);
263 detail::unicode_word_break_WB5_WB999(r, infos);
#define hilet
Invariant should be the default for variables.
Definition required.hpp:23
unicode_break_vector unicode_word_break(It first, ItEnd last, DescriptionFunc const &description_func)
The unicode word break algorithm UAX#29.
Definition unicode_word_break.hpp:249
Definition unicode_word_break.hpp:42
T back_inserter(T... args)