7#include "../required.hpp"
8#include "../endian.hpp"
9#include "../CP1252.hpp"
/** Decode one code point from a UTF-16 sequence.
 *
 * Reads one code unit through `it`, and a second one when the first is a
 * surrogate; `it` is post-incremented for every unit read.
 * The surrogate order is only checked through tt_axiom() (presumably a
 * debug-time assumption from required.hpp -- TODO confirm); no error value
 * is returned.
 *
 * @param it Iterator to read from; the static_assert requires `value_type`
 *           (alias not shown in this view) to be char16_t.
 * @return The decoded code point.
 */
26template<
typename Iterator>
27[[nodiscard]]
constexpr char32_t utf16_to_utf32(Iterator &it)
noexcept
30 static_assert(std::is_same_v<value_type, char16_t>,
"Iterator must point to a char16_t");
32 ttlet first = *(it++);
// True for every code unit outside the surrogate range 0xd800-0xdfff.
33 if (first <= 0xd7ff || first >= 0xe000) {
// Past this point `first` is a surrogate; it must be a high one (0xd800-0xdbff).
37 tt_axiom(first <= 0xdbff,
"Expecting the high surrogate");
38 ttlet second = *(it++);
39 tt_axiom(second >= 0xdc00 && second <= 0xdfff,
"Expecting the low surrogate");
// The high surrogate carries the upper 10 bits, the low surrogate the lower
// 10 bits; the 0x10000 offset is added back afterwards.
41 return ((
static_cast<char32_t>(first - 0xd800) << 10) | (
static_cast<char32_t>(second - 0xdc00))) + 0x01'0000;
55template<
typename Iterator>
56[[nodiscard]]
constexpr char32_t utf8_to_utf32(Iterator &it)
noexcept
59 static_assert(std::is_same_v<value_type, char8_t>,
"Iterator must point to a char8_t");
61 auto cp =
char32_t{0};
65 return static_cast<char32_t>(cu);
67 }
else if (cu <= 0xdf) {
68 tt_axiom(cu >= 0xc0,
"UTF-8 encoded code-point can not start with continuation code-units");
69 cp =
static_cast<char32_t>(cu & 0x1f);
71 cp |=
static_cast<char32_t>(*(it++) & 0x3f);
72 tt_axiom(cp >= 0x0080 && cp <= 0x07ff,
"UTF-8 Overlong encoding");
75 }
else if (cu <= 0xef) {
76 cp =
static_cast<char32_t>(cu & 0x0f);
78 cp |=
static_cast<char32_t>(*(it++) & 0x3f);
80 cp |=
static_cast<char32_t>(*(it++) & 0x3f);
81 tt_axiom(cp >= 0x0800 && cp <= 0xffff,
"UTF-8 Overlong encoding");
82 tt_axiom(!(cp >= 0xd800 && cp <= 0xdfff),
"UTF-8 Must not encode surrogates");
86 tt_axiom(cu <= 0xf7,
"UTF8 encoded code-point must have a valid start code-unit");
87 cp =
static_cast<char32_t>(cu & 0x07);
89 cp |=
static_cast<char32_t>(*(it++) & 0x3f);
91 cp |=
static_cast<char32_t>(*(it++) & 0x3f);
93 cp |=
static_cast<char32_t>(*(it++) & 0x3f);
94 tt_axiom(cp >= 0x100000 && cp <= 0x10ffff,
"UTF-8 Overlong encoding");
111template<
typename Iterator>
112constexpr bool utf8_to_utf32(Iterator &it, Iterator last,
char32_t &code_point)
noexcept
115 static_assert(std::is_same_v<value_type, char8_t>,
"Iterator must point to a char8_t");
117 auto continuation_count = 0;
118 auto first_cu = *(it++);
119 if (first_cu <= 0x7f) {
120 code_point = first_cu;
123 }
else if (first_cu <= 0xbf) {
125 code_point = CP1252_to_UTF32(
static_cast<char>(first_cu));
128 }
else if (first_cu <= 0xdf) {
129 code_point =
static_cast<char32_t>(first_cu & 0x1f);
130 continuation_count = 1;
132 }
else if (first_cu <= 0xef) {
133 code_point =
static_cast<char32_t>(first_cu & 0x0f);
134 continuation_count = 2;
136 }
else if (first_cu <= 0xf7) {
137 code_point =
static_cast<char32_t>(first_cu & 0x07);
138 continuation_count = 3;
142 code_point = CP1252_to_UTF32(
static_cast<char>(first_cu));
147 for (
int i = 0; i != continuation_count; ++i) {
148 if (it == last || (*it & 0xc0) != 0x80) {
150 [[unlikely]] code_point = CP1252_to_UTF32(
static_cast<char>(first_cu));
156 code_point = *(it++) & 0x3f;
159 if ((code_point >= 0xd800 && code_point <= 0xdfff) ||
160 (continuation_count == 1 && code_point < 0x0080) ||
161 (continuation_count == 2 && code_point < 0x0800) ||
162 (continuation_count == 3 && code_point < 0x10000)
165 code_point = CP1252_to_UTF32(
static_cast<char>(first_cu));
/** Encode one code point as UTF-16 and append it through `it`.
 *
 * Writes one code unit for code points up to U+FFFF and two code units
 * (a surrogate pair) otherwise. Invalid code points are only checked
 * through tt_axiom() (presumably a debug-time assumption -- TODO confirm).
 *
 * @param code_point The code point to encode; modified locally only.
 * @param it Back-insert iterator of a container whose value_type is two
 *           bytes wide (static_assert).
 */
181template<
typename BackInsertIterator>
182constexpr void utf32_to_utf16(
char32_t code_point, BackInsertIterator &it)
noexcept
184 using value_type =
typename BackInsertIterator::container_type::value_type;
185 static_assert(
sizeof(value_type) == 2,
"Iterator must point to a two byte character type");
// Code points up to U+FFFF fit in a single code unit.
187 if (code_point <= 0xffff) {
188 tt_axiom(!(code_point >= 0xd800 && code_point <= 0xdfff),
"Code Point must not be a surrogate-code");
189 *(it++) =
static_cast<value_type
>(code_point);
192 tt_axiom(code_point <= 0x10ffff,
"Code Point must be in range of the 17 planes");
// Remove the 0x10000 offset; the remainder is split over two surrogates.
194 code_point -= 0x10000;
// High surrogate: bits above the low 10, tagged with 0xd800.
195 *(it++) =
static_cast<value_type
>(code_point >> 10) | 0xd800;
// Low surrogate: low 10 bits, tagged with 0xdc00 ('&' binds tighter than '|').
196 *(it++) =
static_cast<value_type
>(code_point) & 0x03ff | 0xdc00;
/** Encode one code point as UTF-8 and append it through `it`.
 *
 * Writes one to four code units depending on the magnitude of `code_point`.
 * Surrogates and values above U+10FFFF are only checked through tt_axiom()
 * (presumably a debug-time assumption -- TODO confirm).
 *
 * @param code_point The code point to encode.
 * @param it Back-insert iterator of a container whose value_type is one
 *           byte wide (static_assert).
 */
208template<
typename BackInsertIterator>
209constexpr void utf32_to_utf8(
char32_t code_point, BackInsertIterator &it)
noexcept
211 using value_type =
typename BackInsertIterator::container_type::value_type;
212 static_assert(
sizeof(value_type) == 1,
"UTF-8 values must be stored in a 1 byte character type");
// One code unit: U+0000..U+007F.
214 if (code_point <= 0x7f) {
215 *(it++) =
static_cast<value_type
>(code_point);
// Two code units: lead byte tagged 0xc0, continuation tagged 0x80.
217 }
else if (code_point <= 0x07ff) {
218 *(it++) =
static_cast<value_type
>(code_point >> 6) | 0xc0;
219 *(it++) =
static_cast<value_type
>(code_point) & 0x3f | 0x80;
// Three code units: lead byte tagged 0xe0.
221 }
else if (code_point <= 0xffff) {
222 tt_axiom(!(code_point >= 0xd800 && code_point <= 0xdfff),
"Code Point must not be a surrogate");
223 *(it++) =
static_cast<value_type
>(code_point >> 12) | 0xe0;
// Each continuation unit keeps 6 payload bits ('&' binds tighter than '|').
224 *(it++) =
static_cast<value_type
>(code_point >> 6) & 0x3f | 0x80;
225 *(it++) =
static_cast<value_type
>(code_point) & 0x3f | 0x80;
// Four code units: lead byte tagged 0xf0.
228 tt_axiom(code_point <= 0x10ffff,
"Code Point must be in range of the 17 planes");
229 *(it++) =
static_cast<value_type
>(code_point >> 18) | 0xf0;
230 *(it++) =
static_cast<value_type
>(code_point >> 12) & 0x3f | 0x80;
231 *(it++) =
static_cast<value_type
>(code_point >> 6) & 0x3f | 0x80;
232 *(it++) =
static_cast<value_type
>(code_point) & 0x3f | 0x80;
247 for (
char32_t &c : r) {
248 if (c > 0x10'ffff || (c >= 0xd800 && c <= 0xdfff)) {
268template<
typename Container, std::endian Endian = std::endian::native>
269[[nodiscard]]
std::u16string make_u16string(Container
const &rhs)
noexcept
273 if constexpr (
sizeof(Container::value_type) == 1) {
275 r.
reserve((size(rhs) + 1) / 2);
276 for (ssize_t i = 0; i < ssize(rhs); i += 2) {
277 if constexpr (Endian == std::endian::little) {
278 r +=
static_cast<char16_t>(rhs[i]) << 8 |
static_cast<char16_t>(rhs[i + 1]);
280 r +=
static_cast<char16_t>(rhs[i]) |
static_cast<char16_t>(rhs[i + 1]) << 8;
283 if (size(rhs) % 2 == 1) {
290 r.reserve(size(rhs));
291 for (
auto &&c : rhs) {
292 r += Endian == std::endian::native ?
static_cast<char16_t>(c) : static_cast<char16_t>(byte_swap(c));
311 auto swap_endian =
false;
313 ttlet length = size(r);
315 while (i != length) {
316 auto code_unit = r[i];
318 code_unit = r[i] = byte_swap(code_unit);
321 if (code_unit == 0xfffe) {
323 swap_endian = !swap_endian;
326 }
else if (code_unit >= 0xd800 && code_unit <= 0xdbff) {
330 auto next_code_unit = r[i];
331 if (next_code_unit >= 0xdc00 && next_code_unit <= 0xdfff) {
338 r[old_i] =
char16_t{0xfffd};
343 r[old_i] =
char16_t{0xfffd};
346 }
else if (code_unit >= 0xdc00 && code_unit <= 0xdfff) {
348 r[i++] =
char16_t{0xfffd};
/** Build a std::u8string by casting every element of a container to char8_t.
 *
 * The elements are copied one by one in iteration order; nothing in the
 * visible code validates that the result is well-formed UTF-8.
 *
 * @tparam Container Any type usable in a range-based for loop.
 * @param rhs The container to read from.
 */
358template<
typename Container>
359[[nodiscard]] std::u8string make_u8string(Container
const &rhs)
noexcept
361 auto r = std::u8string{};
365 for (
auto &&c : rhs) {
// Plain value conversion of each element to a UTF-8 code unit.
366 r +=
static_cast<char8_t>(c);
380[[nodiscard]]
inline std::u8string sanitize_u8string(std::u8string &&rhs)
noexcept
384 ttlet first =
begin(rhs);
385 ttlet last =
end(rhs);
387 auto code_point =
char32_t{};
390 for (
auto it = first; valid && it != last;) {
392 valid &= utf8_to_utf32(it, last, code_point);
401 auto tmp = std::u8string{
begin(r), old_it};
402 tmp.reserve(size(r));
406 utf32_to_utf8(code_point, tmp_i);
409 for (
auto it = old_it + 1; it != last;) {
410 utf8_to_utf32(it, last, code_point);
411 utf32_to_utf8(code_point, tmp_i);
/** Convert UTF-16 text to UTF-8, returned as a StringT.
 *
 * Each code point is decoded with utf16_to_utf32() and re-encoded with
 * utf32_to_utf8().
 * NOTE(review): utf16_to_utf32() only checks surrogates through tt_axiom(),
 * so `rhs` is presumably expected to be valid UTF-16 -- confirm.
 *
 * @tparam StringT The string type to return.
 * @param rhs UTF-16 text.
 */
420template<
typename StringT>
421[[nodiscard]]
inline StringT to_u8string(std::u16string_view
const &rhs)
noexcept
// Starting capacity of one output unit per input unit.
424 r.reserve(rhs.size());
428 ttlet c32 = utf16_to_utf32(it);
429 utf32_to_utf8(c32, r_it);
/** Convert UTF-32 text to UTF-8, returned as a StringT.
 *
 * Every code point of `rhs` is encoded with utf32_to_utf8().
 *
 * @tparam StringT The string type to return.
 * @param rhs UTF-32 text.
 */
434template<
typename StringT>
435[[nodiscard]]
inline StringT to_u8string(std::u32string_view
const &rhs)
noexcept
// Starting capacity of one output unit per code point.
438 r.reserve(rhs.size());
441 for (
auto c32 : rhs) {
442 utf32_to_utf8(c32, r_it);
/** Convert wide-character text to UTF-8, returned as a StringT.
 *
 * The wide string is treated as UTF-16 when wchar_t is two bytes wide and as
 * UTF-32 otherwise. In both cases the data is sanitized first and then
 * handed to the matching to_u8string<StringT>() overload.
 *
 * @tparam StringT The string type to return.
 * @param rhs Wide-character text.
 */
447template<
typename StringT>
448[[nodiscard]]
inline StringT to_u8string(std::wstring_view
const &rhs)
noexcept
// Select the encoding at compile time from the width of wchar_t.
450 if constexpr (
sizeof(std::wstring::value_type) == 2) {
// Copy the same memory into a char16_t string, then sanitize it.
451 auto s16 = sanitize_u16string(
std::u16string{
reinterpret_cast<char16_t const *
>(rhs.data()), rhs.
size()});
452 return to_u8string<StringT>(
std::move(s16));
// Otherwise copy the same memory into a char32_t string, then sanitize it.
454 auto s32 = sanitize_u32string(
std::u32string{
reinterpret_cast<char32_t const *
>(rhs.data()), rhs.
size()});
455 return to_u8string<StringT>(
std::move(s32));
469 return std::string{
reinterpret_cast<char const *
>(rhs.data()), rhs.
size()};
/** Convert UTF-16 text to a std::u8string.
 *
 * Forwards to detail::to_u8string<std::u8string>().
 *
 * @param rhs UTF-16 text.
 */
478[[nodiscard]]
inline std::u8string to_u8string(std::u16string_view
const &rhs)
noexcept
480 return detail::to_u8string<std::u8string>(rhs);
491 return detail::to_u8string<std::string>(rhs);
/** Convert UTF-32 text to a std::u8string.
 *
 * Forwards to detail::to_u8string<std::u8string>().
 *
 * @param rhs UTF-32 text.
 */
500[[nodiscard]]
inline std::u8string to_u8string(std::u32string_view
const &rhs)
noexcept
502 return detail::to_u8string<std::u8string>(rhs);
513 return detail::to_u8string<std::string>(rhs);
/** Convert UTF-8 text to a std::u16string.
 *
 * Each code point is decoded with the single-argument utf8_to_utf32() and
 * re-encoded with utf32_to_utf16().
 * NOTE(review): that decoder takes no end iterator, so `rhs` is presumably
 * expected to be valid UTF-8 already -- confirm against callers.
 *
 * @param rhs UTF-8 text.
 */
522[[nodiscard]]
inline std::u16string to_u16string(std::u8string_view
const &rhs)
noexcept
529 ttlet c32 = utf8_to_utf32(it);
530 utf32_to_utf16(c32, r_it);
/** Convert UTF-32 text to a std::u16string.
 *
 * Every code point of `rhs` is encoded with utf32_to_utf16().
 *
 * @param rhs UTF-32 text.
 */
541[[nodiscard]]
inline std::u16string to_u16string(std::u32string_view
const &rhs)
noexcept
547 for (
auto c32 : rhs) {
548 utf32_to_utf16(c32, r_it);
/** Convert UTF-8 text to a std::u32string.
 *
 * Code points are decoded with the single-argument utf8_to_utf32() and
 * appended to the result.
 * NOTE(review): that decoder takes no end iterator, so `rhs` is presumably
 * expected to be valid UTF-8 already -- confirm against callers.
 *
 * @param rhs UTF-8 text.
 */
559[[nodiscard]]
inline std::u32string to_u32string(std::u8string_view
const &rhs)
noexcept
565 r += utf8_to_utf32(it);
/** Convert UTF-16 text to a std::u32string.
 *
 * Code points are decoded with utf16_to_utf32() and appended to the result.
 * NOTE(review): utf16_to_utf32() only checks surrogates through tt_axiom(),
 * so `rhs` is presumably expected to be valid UTF-16 -- confirm.
 *
 * @param rhs UTF-16 text.
 */
576[[nodiscard]]
inline std::u32string to_u32string(std::u16string_view
const &rhs)
noexcept
582 r += utf16_to_utf32(it);
/** Convert a char-based string to a std::u8string.
 *
 * The bytes of `rhs` are copied unchanged into a std::u8string, which is
 * then passed through sanitize_u8string().
 *
 * @param rhs Text stored in char elements.
 */
593[[nodiscard]]
inline std::u8string to_u8string(std::string_view
const &rhs)
noexcept
// Same bytes, viewed as char8_t; sanitizing happens on the copy.
595 return sanitize_u8string(std::u8string{
reinterpret_cast<char8_t const *
>(rhs.data()), rhs.size()});
/** Convert a char-based string to a std::u16string.
 *
 * The bytes of `rhs` are copied into a std::u8string, sanitized with
 * sanitize_u8string(), and the result is converted by to_u16string().
 *
 * @param rhs Text stored in char elements.
 */
604[[nodiscard]]
inline std::u16string to_u16string(std::string_view
const &rhs)
noexcept
// Sanitize before converting.
606 return to_u16string(sanitize_u8string(std::u8string{
reinterpret_cast<char8_t const *
>(rhs.data()), rhs.size()}));
/** Convert a char-based string to a std::u32string.
 *
 * The bytes of `rhs` are copied into a std::u8string, sanitized with
 * sanitize_u8string(), and the result is converted by to_u32string().
 *
 * @param rhs Text stored in char elements.
 */
615[[nodiscard]]
inline std::u32string to_u32string(std::string_view
const &rhs)
noexcept
// Sanitize before converting.
617 return to_u32string(sanitize_u8string(std::u8string{
reinterpret_cast<char8_t const *
>(rhs.data()), rhs.size()}));
/** Convert wide-character text to a std::u8string.
 *
 * Forwards to detail::to_u8string<std::u8string>().
 *
 * @param rhs Wide-character text.
 */
626[[nodiscard]]
inline std::u8string to_u8string(std::wstring_view
const &rhs)
noexcept
628 return detail::to_u8string<std::u8string>(rhs);
639 return detail::to_u8string<std::string>(rhs);
/** Convert wide-character text to a std::u16string.
 *
 * When wchar_t is two bytes wide the data is copied as char16_t and returned
 * after sanitize_u16string(). Otherwise it is copied as char32_t and passed
 * through sanitize_u32string(); how that result is converted is not visible
 * in this view.
 *
 * @param rhs Wide-character text.
 */
648[[nodiscard]]
inline std::u16string to_u16string(std::wstring_view
const &rhs)
noexcept
// Select the encoding at compile time from the width of wchar_t.
650 if constexpr (
sizeof(std::wstring::value_type) == 2) {
651 return sanitize_u16string(
std::u16string{
reinterpret_cast<char16_t const *
>(rhs.data()), rhs.
size()});
653 auto s32 = sanitize_u32string(
std::u32string{
reinterpret_cast<char32_t const *
>(rhs.data()), rhs.
size()});
/** Convert wide-character text to a std::u32string.
 *
 * When wchar_t is two bytes wide the data is copied as char16_t and passed
 * through sanitize_u16string(); how that result is converted is not visible
 * in this view. Otherwise the data is copied as char32_t and returned after
 * sanitize_u32string().
 *
 * @param rhs Wide-character text.
 */
664[[nodiscard]]
inline std::u32string to_u32string(std::wstring_view
const &rhs)
noexcept
// Select the encoding at compile time from the width of wchar_t.
666 if constexpr (
sizeof(std::wstring::value_type) == 2) {
667 auto s16 = sanitize_u16string(
std::u16string{
reinterpret_cast<char16_t const *
>(rhs.data()), rhs.
size()});
670 return sanitize_u32string(
std::u32string{
reinterpret_cast<char32_t const *
>(rhs.data()), rhs.
size()});
682 if constexpr (
sizeof(std::wstring::value_type) == 2) {
683 auto s16 = to_u16string(rhs);
684 return std::wstring{
reinterpret_cast<wchar_t const *
>(s16.data()), s16.
size()};
686 auto s32 = to_u32string(rhs);
687 return std::wstring{
reinterpret_cast<wchar_t const *
>(s32.data()), s32.
size()};
699 auto s8 = sanitize_u8string(std::u8string{
reinterpret_cast<char8_t const *
>(rhs.data()), rhs.size()});
711 if constexpr (
sizeof(std::wstring::value_type) == 2) {
712 return std::wstring{
reinterpret_cast<wchar_t const *
>(rhs.data()), rhs.
size()};
714 auto s32 = to_u32string(rhs);
715 return std::wstring{
reinterpret_cast<wchar_t const *
>(s32.data()), s32.
size()};
727 if constexpr (
sizeof(std::wstring::value_type) == 2) {
728 auto s16 = to_u16string(rhs);
729 return std::wstring{
reinterpret_cast<wchar_t const *
>(s16.data()), s16.
size()};
731 return std::wstring{
reinterpret_cast<wchar_t const *
>(rhs.data()), rhs.
size()};
T back_inserter(T... args)