4#include "TTauri/Foundation/required.hpp"
5#include "TTauri/Foundation/endian.hpp"
6#include "TTauri/Foundation/CP1252.hpp"
7#include "TTauri/Foundation/math.hpp"
// Unicode code-point boundaries and special code points used by the UTF
// conversion routines in this file. All ranges are inclusive.

/// Last code point of the 7-bit ASCII range.
constexpr char32_t UnicodeASCIIEnd = 0x7f;
/// Last code point of plane 0.
constexpr char32_t UnicodePlane0End = 0xffff;
/// First code point of plane 1 (the first code point that needs a UTF-16 surrogate pair).
constexpr char32_t UnicodePlane1Begin = 0x010000;
/// Last code point of plane 16.
constexpr char32_t UnicodePlane16End = 0x10ffff;

/// First code point of the surrogate range (high and low surrogates together).
constexpr char32_t UnicodeSurrogatesBegin = 0xd800;
/// Last code point of the surrogate range (high and low surrogates together).
constexpr char32_t UnicodeSurrogatesEnd = 0xdfff;
/// First high (leading) surrogate.
constexpr char32_t UnicodeHighSurrogatesBegin = 0xd800;
/// Last high (leading) surrogate.
constexpr char32_t UnicodeHighSurrogatesEnd = 0xdbff;
/// First low (trailing) surrogate.
constexpr char32_t UnicodeLowSurrogatesBegin = 0xdc00;
/// Last low (trailing) surrogate.
constexpr char32_t UnicodeLowSurrogatesEnd = 0xdfff;

/// Alias for the end of plane 0. The spelling of this identifier is kept
/// as-is because it is part of the public interface of this header.
constexpr char32_t UnicodeBasicMultilinqualPlaneEnd = UnicodePlane0End;
/// Largest valid Unicode code point.
constexpr char32_t UnicodeMax = UnicodePlane16End;

/// U+FEFF ZERO WIDTH NO-BREAK SPACE.
constexpr char32_t UnicodeZeroWidthNoBreakSpace = 0xfeff;
/// Byte-order mark; the same code point as UnicodeZeroWidthNoBreakSpace.
constexpr char32_t UnicodeBOM = UnicodeZeroWidthNoBreakSpace;
/// U+FFFD REPLACEMENT CHARACTER, substituted for invalid input.
constexpr char32_t UnicodeReplacementCharacter = 0xfffd;
/// Non-character U+FFFE.
constexpr char32_t UnicodeNonCharacterFFFE = 0xfffe;
/// Non-character U+FFFF.
constexpr char32_t UnicodeNonCharacterFFFF = 0xffff;
/// Value a 16-bit BOM takes when read with the wrong byte order (0xFEFF byte-swapped).
constexpr char32_t UnicodeReverseBOM = UnicodeNonCharacterFFFE;
// Maps a single ligature code point `x` to the sequence of code points it is
// composed of, returned as a std::u32string.
// Visible cases cover U+FB00..U+FB06 (Latin ligatures) and U+FB13..U+FB17
// (Armenian ligatures).
// NOTE(review): the enclosing `switch`, any default case and the closing braces
// are not visible in this excerpt, so the result for any other code point
// cannot be confirmed from here.
34[[nodiscard]]
inline std::u32string splitLigature(
char32_t x)
noexcept
// U+FB00 -> 'f' 'f'
37 case 0xfb00:
return { 0x0066, 0x0066 };
// U+FB01 -> 'f' 'i'
38 case 0xfb01:
return { 0x0066, 0x0069 };
// U+FB02 -> 'f' 'l'
39 case 0xfb02:
return { 0x0066, 0x006c };
// U+FB03 -> 'f' 'f' 'i'
40 case 0xfb03:
return { 0x0066, 0x0066, 0x0069 };
// U+FB04 -> 'f' 'f' 'l'
41 case 0xfb04:
return { 0x0066, 0x0066, 0x006c };
// U+FB05 -> U+017F (long s) 't'
42 case 0xfb05:
return { 0x017f, 0x0074 };
// U+FB06 -> 's' 't'
43 case 0xfb06:
return { 0x0073, 0x0074 };
// U+FB13 -> Armenian U+0574 U+0576
45 case 0xfb13:
return { 0x0574, 0x0576 };
// U+FB14 -> Armenian U+0574 U+0565
46 case 0xfb14:
return { 0x0574, 0x0565 };
// U+FB15 -> Armenian U+0574 U+056B
47 case 0xfb15:
return { 0x0574, 0x056b };
// U+FB16 -> Armenian U+057E U+0576
48 case 0xfb16:
return { 0x057e, 0x0576 };
// U+FB17 -> Armenian U+0574 U+056D
49 case 0xfb17:
return { 0x0574, 0x056d };
// Validates a single UTF-32 code point.
// The visible conditions select UnicodeReplacementCharacter when `c` is a
// surrogate (UnicodeSurrogatesBegin..UnicodeSurrogatesEnd) or one of the
// non-characters U+FFFE / U+FFFF.
// NOTE(review): the start of the expression and the "else" operand of the
// conditional are not visible in this excerpt; presumably `c` is returned
// unchanged otherwise, and there may be further conditions — confirm against
// the full file.
55[[nodiscard]]
constexpr char32_t utf32_validate(
char32_t c)
noexcept
// Surrogate code points are not valid scalar values.
59 (c >= UnicodeSurrogatesBegin && c <= UnicodeSurrogatesEnd) ||
60 (c == UnicodeNonCharacterFFFE) ||
61 (c == UnicodeNonCharacterFFFF)
// Result when any of the conditions above holds.
63 UnicodeReplacementCharacter :
// Builds a std::u16string from `str` using the callable `unary_op`.
// The wrappers in this file pass a lambda that takes one code unit and returns
// the converted code unit.
// NOTE(review): the function body is not visible in this excerpt; presumably
// `unary_op` is applied to every code unit in order — confirm against the
// full file.
67template<
typename UnaryOperation>
68[[nodiscard]]
inline std::u16string u16string_transform(std::u16string_view str, UnaryOperation unary_op)
noexcept
77[[nodiscard]]
inline std::u16string u16string_byte_swap(std::u16string_view str)
noexcept
79 return u16string_transform(str, [](ttlet &c) {
return byte_swap(c); });
82[[nodiscard]]
inline std::u16string u16string_little_to_native(std::u16string_view str)
noexcept
84 return u16string_transform(str, [](ttlet &c) {
return little_to_native(c); });
87[[nodiscard]]
inline std::u16string u16string_big_to_native(std::u16string_view str)
noexcept
89 return u16string_transform(str, [](ttlet &c) {
return big_to_native(c); });
// Converts a UTF-32 code point `c` to UTF-16, returning one char16_t per call.
// `state` is taken by reference; callers in this file pass the same variable
// on consecutive calls for the same code point.
// NOTE(review): the lines that read and update `state`, and any shifting or
// masking of `c` before the surrogate returns, are not visible in this
// excerpt — confirm the exact protocol against the full file.
99[[nodiscard]]
constexpr char16_t utf32_to_utf16(
char32_t c,
int &state)
noexcept
// Code points at or above U+10000 do not fit in a single 16-bit unit.
101 if (c >= UnicodePlane1Begin) {
// Rebase so that `c` becomes an offset from U+10000.
102 c -= UnicodePlane1Begin;
// Result built on the high-surrogate base.
106 return static_cast<char16_t>(UnicodeHighSurrogatesBegin + c);
// Result built on the low-surrogate base.
110 return static_cast<char16_t>(UnicodeLowSurrogatesBegin + c);
// Plain narrowing of the code point to one 16-bit unit.
115 return static_cast<char16_t>(c);
// Converts UTF-16 to UTF-32 one code unit at a time.
// `state` carries the pending high surrogate between calls.
// NOTE(review): the branch that selects between the "no pending surrogate"
// and "pending surrogate" paths, and the value returned after a high
// surrogate is stored, are not visible in this excerpt.
127[[nodiscard]]
constexpr char32_t utf16_to_utf32(
char16_t c, uint32_t &state)
noexcept
// High surrogate: remember its offset, shifted left by 18, and set bit 0 so
// that the stored state is non-zero even when the offset is zero.
130 if (c >= UnicodeHighSurrogatesBegin && c <= UnicodeHighSurrogatesEnd) {
131 state =
static_cast<uint32_t
>(c - UnicodeHighSurrogatesBegin) << 18 | 1;
133 }
// A low surrogate on this path yields the replacement character.
else if (c >= UnicodeLowSurrogatesBegin && c <= UnicodeLowSurrogatesEnd) {
134 return UnicodeReplacementCharacter;
// Any other code unit is returned as a code point as-is.
136 return static_cast<char32_t>(c);
// Low surrogate: combine it with the stored high surrogate.
139 if (c >= UnicodeLowSurrogatesBegin && c <= UnicodeLowSurrogatesEnd) {
// (offset << 18) >> 8 == offset << 10: this drops the marker in bit 0 and
// leaves the high-surrogate offset in bits 10..19.
140 ttlet upper10bits =
static_cast<char32_t>(state >> 8);
141 ttlet lower10bits =
static_cast<char32_t>(c - UnicodeLowSurrogatesBegin);
// Re-add the U+10000 base of the supplementary planes.
143 return (upper10bits | lower10bits) + UnicodePlane1Begin;
// Anything other than a low surrogate on this path yields the replacement character.
146 return UnicodeReplacementCharacter;
// Converts a UTF-32 code point `c` to UTF-8, returning one byte per call.
// `state` is used as the right-shift applied to `c` for the byte being
// emitted.
// NOTE(review): the lines that initialise and decrement `state`, and the
// condition guarding the first return, are not visible in this excerpt —
// confirm against the full file.
158[[nodiscard]]
constexpr char utf32_to_utf8(
char32_t c,
int &state)
noexcept
// Single-byte result: the code point itself.
163 return static_cast<char>(c);
164 }
// Up to U+07FF: lead byte with the 0xC0 marker bits.
else if (c <= 0x07ff) {
166 return static_cast<char>((c >> state) | 0xc0);
167 }
// Up to U+FFFF: lead byte with the 0xE0 marker bits.
else if (c <= 0xffff) {
169 return static_cast<char>((c >> state) | 0xe0);
// Remaining code points: lead byte with the 0xF0 marker bits.
172 return static_cast<char>((c >> state) | 0xf0);
// Continuation byte: six payload bits under the 0x80 marker.
177 return static_cast<char>(((c >> state) & 0x3f) | 0x80);
181[[nodiscard]] tt_no_inline
char32_t utf8_to_utf32_fallback(
char c)
noexcept
183 return CP1252_to_UTF32(c);
// Body of the incremental UTF-8 decoder (called as `utf8_to_utf32(u, state)`
// elsewhere in this file).
// NOTE(review): the function signature, the definition of the state object
// and several lines of the body are not visible in this excerpt.
// Work on the byte as an unsigned value so the bit tests below are well defined.
204 auto c_ =
static_cast<uint8_t
>(c);
// A multi-byte sequence is in progress.
206 if (state.trailing_bytes) {
// Continuation bytes have the bit pattern 10xxxxxx.
207 if ((c_ & 0xc0) == 0x80) {
208 --state.trailing_bytes;
// Merge the six payload bits into the code point being assembled.
210 state.code |= (c_ & 0x3f);
// 0 means "no code point yet"; the assembled code is returned after the last byte.
211 return state.trailing_bytes ? 0 : state.code;
// Abandon the sequence and decode this byte with the fallback.
214 state.trailing_bytes = 0;
215 return utf8_to_utf32_fallback(c_);
// Invert the byte so the first 0 bit of the lead byte becomes the highest set bit.
219 ttlet inv_c32 =
static_cast<uint32_t
>(
static_cast<uint8_t
>(~c_));
// NOTE(review): presumably bsr() returns the index of the highest set bit — confirm.
220 ttlet nr_data_bits = bsr(inv_c32);
222 state.trailing_bytes = 6 - nr_data_bits;
// Negative count: reset the counter (the rest of this branch is not visible here).
223 if (state.trailing_bytes < 0) {
225 state.trailing_bytes = 0;
228 }
// Lead byte announcing 1 to 3 continuation bytes: keep its payload bits.
else if (state.trailing_bytes > 0 && state.trailing_bytes <= 3) {
230 ttlet data_mask = (1 << nr_data_bits) - 1;
231 state.code = (c_ & data_mask);
// Any other byte here is decoded with the fallback.
237 state.trailing_bytes = 0;
238 return utf8_to_utf32_fallback(c_);
// NOTE(review): fragment of a UTF-32 to UTF-8 conversion; the enclosing
// function header and loops are not visible in this excerpt.
// Replace invalid code points before encoding.
250 c = utf32_validate(c);
// Append the byte produced by utf32_to_utf8() for the current `state`.
254 r += utf32_to_utf8(c, state);
// Converts a UTF-32 string to a UTF-16 string.
// NOTE(review): the declarations of `r` and `state` and the loop structure
// are not visible in this excerpt.
263[[nodiscard]]
inline std::u16string to_u16string(std::u32string_view rhs)
noexcept {
// Replace invalid code points before encoding.
268 c = utf32_validate(c);
// Append the code unit produced by utf32_to_utf16() for the current `state`.
272 r += utf32_to_utf16(c, state);
// NOTE(review): fragments of what appear to be two wide-string conversions;
// the enclosing function headers and loops are not visible in this excerpt.
// Replace invalid code points before encoding.
287 c = utf32_validate(c);
// Append a UTF-16 code unit, cast to wchar_t.
291 r +=
static_cast<wchar_t>(utf32_to_utf16(c, state));
// Append the value of `c` directly, cast to wchar_t.
305 r +=
static_cast<wchar_t>(c);
// Converts a byte string to UTF-32 using the incremental decoder utf8_to_utf32().
// NOTE(review): the declaration of `r` and the loop header are not visible in
// this excerpt.
314[[nodiscard]]
inline std::u32string to_u32string(std::string_view rhs)
noexcept {
// Fresh decoder state for this conversion.
318 auto state = utf8_to_utf32_state{};
// A result of 0 is treated as "no code point produced" and nothing is appended.
// NOTE(review): this also means a decoded U+0000 would not be appended — confirm
// that this is intended.
320 if (
auto c = utf8_to_utf32(u, state)) {
// Replace invalid code points before appending.
321 r += utf32_validate(c);
// Converts a UTF-16 string to UTF-32.
// NOTE(review): the declarations of `r`, `state` and `swapped_str` and the
// loop header are not visible in this excerpt.
330[[nodiscard]]
inline std::u32string to_u32string(std::u16string_view rhs)
noexcept {
// A first code unit equal to UnicodeReverseBOM (0xFFFE) is handled by
// byte-swapping the whole string.
335 if (ssize(rhs) != 0 && rhs.front() == UnicodeReverseBOM) {
336 swapped_str = u16string_byte_swap(rhs);
// Re-point the view at the swapped copy.
337 rhs = std::u16string_view{swapped_str};
// A result of 0 is treated as "no code point produced" and nothing is appended.
342 if (
auto c = utf16_to_utf32(u, state)) {
343 tt_assume(c <= UnicodeMax);
// Replace invalid code points before appending.
344 r += utf32_validate(c);
// Converts a wide string to UTF-32 by decoding each wchar_t as a UTF-16 code unit.
// NOTE(review): a second definition with the same signature follows in this
// file; presumably a preprocessor condition that is not visible here selects
// between them. The declarations of `r` and `state` and the loop header are
// also not visible.
354[[nodiscard]]
inline std::u32string to_u32string(std::wstring_view rhs)
noexcept {
// A result of 0 is treated as "no code point produced" and nothing is appended.
360 if (
auto c = utf16_to_utf32(
static_cast<char16_t>(u), state)) {
// Replace invalid code points before appending.
361 r += utf32_validate(c);
// Converts a wide string to UTF-32 by appending each element directly.
// NOTE(review): this is the second definition with this signature;
// presumably a preprocessor condition that is not visible here selects
// between them. The declaration of `r` and the loop header are also not visible.
370[[nodiscard]]
inline std::u32string to_u32string(std::wstring_view rhs)
noexcept {
// NOTE(review): the value is cast to wchar_t although the function returns
// std::u32string; a cast to char32_t looks intended — confirm against the
// declaration of `r`.
375 r +=
static_cast<wchar_t>(c);
396[[nodiscard]]
inline std::u16string to_u16string(std::string_view rhs)
noexcept {
397 return to_u16string(to_u32string(rhs));
// Definition Unicode.hpp:186
// T back_inserter(T... args)