10#include "ucd_sentence_break_properties.hpp"
11#include "../macros.hpp"
27 constexpr unicode_sentence_break_info(unicode_sentence_break_property
const &sentence_break_property) noexcept : _value(std::to_underlying(sentence_break_property))
36 [[nodiscard]]
constexpr bool is_skip()
const noexcept
38 return to_bool(_value & 0x40);
41 [[nodiscard]]
constexpr friend bool operator==(
unicode_sentence_break_info const &lhs, unicode_sentence_break_property
const &rhs)
noexcept
43 return (lhs._value & 0x3f) == std::to_underlying(rhs);
50 return rhs == unicode_sentence_break_property::Sep or rhs == unicode_sentence_break_property::CR or rhs == unicode_sentence_break_property::LF;
55 return rhs == unicode_sentence_break_property::STerm or rhs == unicode_sentence_break_property::ATerm;
62[[nodiscard]]
inline void unicode_sentence_break_SB1_SB4(
66 using enum unicode_break_opportunity;
67 using enum unicode_sentence_break_property;
69 hi_axiom(r.size() == infos.size() + 1);
74 for (
auto i = 1_uz; i < infos.size(); ++i) {
75 hilet prev = infos[i - 1];
76 hilet next = infos[i];
79 if (prev == CR and next == LF) {
81 }
else if (is_ParaSep(prev)) {
90[[nodiscard]]
inline void unicode_sentence_break_SB5(
91 unicode_break_vector &r,
94 using enum unicode_break_opportunity;
95 using enum unicode_sentence_break_property;
97 hi_axiom(r.size() == infos.size() + 1);
99 for (
auto i = 1_uz; i < infos.size(); ++i) {
100 hilet
prev = infos[i - 1];
101 auto &
next = infos[i];
103 if ((not is_ParaSep(prev) and prev != CR and prev != LF) and (next == Extend or next == Format)) {
104 if (r[i] == unassigned) {
112[[nodiscard]]
inline void unicode_sentence_break_SB6_SB998(
113 unicode_break_vector &r,
116 using enum unicode_break_opportunity;
117 using enum unicode_sentence_break_property;
119 hi_axiom(r.size() == infos.size() + 1);
121 for (
auto i = 0_z; i < std::ssize(infos); ++i) {
122 hilet &
next = infos[i];
123 if (r[i] != unassigned) {
127 hi_axiom(not
next.is_skip());
132 for (k = i - 1; k >= 0; --k) {
133 if (not infos[k].is_skip()) {
137 return unicode_sentence_break_info{};
140 hilet prev_prev = [&] {
141 for (--k; k >= 0; --k) {
142 if (not infos[k].is_skip()) {
146 return unicode_sentence_break_info{};
154 hilet [prefix, close_sp_par_found] = [&]() {
155 using enum unicode_break_opportunity;
159 for (
auto j = i - 1; j >= 0; --j) {
160 if (not infos[j].is_skip()) {
163 if (is_ParaSep(infos[j])) {
166 }
else if (infos[j] == Sp) {
169 }
else if (infos[j] == Close) {
178 if (infos[j] == Sp) {
181 }
else if (infos[j] == Close) {
189 if (infos[j] == Close) {
201 hilet optional_close = (close_sp_par_found & 3) == 0;
202 hilet optional_close_sp = (close_sp_par_found & 1) == 0;
203 hilet optional_close_sp_par =
true;
205 hilet end_in_lower = [&]{
206 for (
auto j = i; j < std::ssize(infos); ++j) {
207 if (not infos[j].is_skip()) {
208 if (infos[j] == Lower) {
210 }
else if (infos[j] == OLetter or infos[j] == Upper or is_ParaSep(infos[j]) or is_SATerm(infos[j])) {
219 if (prev == ATerm and next == Numeric) {
221 }
else if ((prev_prev == Upper or prev_prev == Lower) and
prev == ATerm and
next == Upper) {
223 }
else if (prefix == ATerm and optional_close_sp and end_in_lower) {
225 }
else if (is_SATerm(prefix) and optional_close_sp and (next == SContinue or is_SATerm(next))) {
227 }
else if (is_SATerm(prefix) and optional_close and (next == Close or next == Sp or is_ParaSep(next))) {
229 }
else if (is_SATerm(prefix) and optional_close_sp and (next == Sp or is_ParaSep(next))) {
231 }
else if (is_SATerm(prefix) and optional_close_sp_par) {
249template<
typename It,
typename ItEnd,
typename CodePo
intFunc>
263 detail::unicode_sentence_break_SB1_SB4(r,
infos);
264 detail::unicode_sentence_break_SB5(r,
infos);
265 detail::unicode_sentence_break_SB6_SB998(r,
infos);
DOXYGEN BUG.
Definition algorithm.hpp:16
unicode_break_vector unicode_sentence_break(It first, ItEnd last, CodePointFunc const &code_point_func) noexcept
The unicode sentence break algorithm UAX#29.
Definition unicode_sentence_break.hpp:251
constexpr Out narrow_cast(In const &rhs) noexcept
Cast numeric values without loss of precision.
Definition cast.hpp:377
Definition unicode_sentence_break.hpp:18
T back_inserter(T... args)