// States of the lexer state machine; the underlying type is a single byte.
// NOTE(review): only some of the enumerators are visible in this excerpt.
132 enum class state_type : uint8_t {
// Entered from dec_integer_found_e / dec_integer_found_E when the next character cannot start an
// exponent: the integer token is emitted on the way in, and the pending letter is then captured
// again while switching to the identifier state.
156 dec_integer_found_e_id,
157 dec_integer_found_E_id,
// Same recovery as above, entered from dec_float_found_e / dec_float_found_E after emitting a real.
162 dec_float_found_e_id,
163 dec_float_found_E_id,
// Per string flavour (single, double and back quote): the _quote state follows a quote character
// seen inside the literal (only wired up when quote doubling is enabled), the _escape state follows
// a backslash.
173 sqstring_literal_quote,
174 sqstring_literal_escape,
176 dqstring_literal_quote,
177 dqstring_literal_escape,
179 bqstring_literal_quote,
180 bqstring_literal_escape,
// found_star follows a star inside a C block comment; the found_dash states track runs of dashes
// inside an SGML block comment.
183 block_comment_found_star,
184 block_comment_found_dash,
185 block_comment_found_dash_dash,
186 block_comment_found_dash_dash_fin0,
// One cell of the transition table: the action taken for a (state, character) pair.
205 struct command_type {
// State the lexer switches to when this command is processed.
208 state_type next_state = state_type::idle;
// Token kind returned by process_command(); token::none means no token is emitted for this step.
212 token::kind_type emit_token = token::none;
// Character appended to the capture of the current token; NUL means nothing is captured.
216 char char_to_capture =
'\0';
// NOTE(review): presumably requests clearing the capture buffer; the code reading this flag is not
// visible in this excerpt.
220 uint8_t clear : 1 = 0;
// Set when the command was added with the advance tag.
224 uint8_t advance : 1 = 0;
// Set by the add() overloads (other than the any overload) once a cell has been claimed; used to
// detect double assignment of a (state, character) pair.
228 uint8_t assigned : 1 = 0;
// Set together with advance when the character is newline, vertical tab or form feed.
232 uint8_t advance_line : 1 = 0;
// Set together with advance when the character is a horizontal tab.
236 uint8_t advance_tab : 1 = 0;
// Empty tag types; _add() recognises them by type (std::is_same_v) to switch on command flags.
241 struct advance_tag {};
242 struct capture_tag {};
// Tag carrying a set of characters that an add() call must leave untouched.
244 class excluding_tag {
// Stores the exclusion characters by moving the string into the tag.
246 constexpr excluding_tag(
std::string exclusions) noexcept : _exclusions(
std::move(exclusions)) {}
// Returns true when c is one of the excluded characters.
248 [[nodiscard]]
constexpr bool contains(
char c)
const noexcept
250 return _exclusions.
find(c) != _exclusions.npos;
// Ready-made tag instances handed to add(): capture, advance and clear are trailing arguments,
// any takes the place of the character selector.
259 constexpr static auto capture = capture_tag{};
263 constexpr static auto advance = advance_tag{};
267 constexpr static auto clear = clear_tag{};
271 constexpr static auto any = any_tag{};
// Builds an excluding_tag from a character array (typically a string literal).
// The string is constructed with an explicit length of N - 1, so the final array element (the
// terminator of a literal) is dropped while any embedded NUL characters are kept.
274 [[nodiscard]]
constexpr excluding_tag excluding(
char const (&exclusions)[N])
noexcept
276 return excluding_tag{
std::string(exclusions, N - 1)};
// Recursive helper for has_advance_tag_argument(): inspects First, then recurses into Args.
// Requires at least one type; the empty pack is handled by the caller.
279 template<
typename First,
typename... Args>
280 [[nodiscard]]
constexpr static bool _has_advance_tag_argument()
noexcept
// Three compile-time cases: First is advance_tag, First was the last type, or more types remain.
282 if constexpr (std::is_same_v<First, advance_tag>) {
284 }
else if constexpr (
sizeof...(Args) == 0) {
287 return _has_advance_tag_argument<Args...>();
// Compile-time check used by the any overload of add() to reject the advance tag.
// The empty pack is special-cased because the recursive helper needs at least one type.
291 template<
typename... Args>
292 [[nodiscard]]
constexpr static bool has_advance_tag_argument()
noexcept
294 if constexpr (
sizeof...(Args) == 0) {
297 return _has_advance_tag_argument<Args...>();
// Builds the complete transition table at construction time.
// NOTE(review): several original lines of this constructor are not visible in this excerpt.
302 constexpr lexer() noexcept : _transition_table()
304 using enum state_type;
// Punctuation that may start a longer construct: capture it, consume it and remember it in a
// dedicated found_* state.
306 add(idle,
'/', found_slash, advance, capture);
307 add(idle,
'<', found_lt, advance, capture);
308 add(idle,
'#', found_hash, advance, capture);
309 add(idle,
'.', found_dot, advance, capture);
310 add(idle,
'=', found_eq, advance, capture);
311 add(idle,
':', found_colon, advance, capture);
// Default for each found_* state: emit the captured punctuation as token::other without consuming
// the following character. More specific transitions are layered on top by the helpers below.
313 add(found_slash, any, idle, token::other);
314 add(found_lt, any, idle, token::other);
315 add(found_hash, any, idle, token::other);
316 add(found_dot, any, idle, token::other);
317 add(found_eq, any, idle, token::other);
318 add(found_colon, any, idle, token::other);
321 add_string_literals();
324 add_number_literals();
330 add_ini_assignment();
// Remaining single-character operators and brackets are emitted directly as token::other.
332 add(idle,
"~!@$%^&*()-+[]{}\\|,>?", idle, token::other, capture, advance);
// Every idle cell in the 7-bit range that is still unassigned becomes an error transition that
// captures the offending character and stays in idle.
335 for (uint8_t i = 0; i != 128; ++i) {
336 auto& command = get_command(idle, char_cast<char>(i));
337 if (not command.assigned) {
338 command.assigned = 1;
341 command.char_to_capture = char_cast<char>(i);
342 command.emit_token = token::error_unexepected_character;
343 command.next_state = idle;
// Returns the mutable table cell for (from, c).
// The table is laid out with 128 consecutive cells per state, indexed by the character value.
// NOTE(review): assumes c is in the 7-bit range so the index stays inside the row of its state.
348 [[nodiscard]]
constexpr command_type& get_command(state_type from,
char c)
noexcept
350 return _transition_table[to_underlying(from) * 128_uz + char_cast<size_t>(c)];
// Read-only counterpart of get_command(): same indexing, 128 cells per state.
// NOTE(review): assumes c is in the 7-bit range so the index stays inside the row of its state.
353 [[nodiscard]]
constexpr command_type
const& get_command(state_type from,
char c)
const noexcept
355 return _transition_table[to_underlying(from) * 128_uz + char_cast<size_t>(c)];
// Token iterator over the character range [first, last).
// NOTE(review): the class head and the constructor signature are not visible in this excerpt.
358 template<
typename It, std::sentinel_for<It> ItEnd>
368 _lexer(
lexer), _first(first), _last(last), _it(first)
// The first token is parsed during construction; parsing repeats for as long as white-space
// filtering is enabled and the token just parsed is a white-space token.
372 _token.kind = parse_token();
373 }
while (Config.filter_white_space and _token.kind == token::ws);
376 [[nodiscard]]
constexpr reference operator*()
const noexcept
// NOTE(review): this overloads the unary address-of operator to return the iterator pointer type.
// Iterators normally provide member access through operator->; confirm which one is intended
// (the body is not visible in this excerpt).
381 [[nodiscard]]
constexpr pointer operator&()
const noexcept
// Pre-increment: parses the next token.
// Precondition (hi_axiom): the iterator has not reached the end sentinel.
386 constexpr iterator& operator++()
noexcept
388 hi_axiom(*
this != std::default_sentinel);
// Repeat while white-space filtering is enabled and the token just parsed is white-space.
390 _token.kind = parse_token();
391 }
while (Config.filter_white_space and _token.kind == token::ws);
395 constexpr void operator++(
int)
noexcept
// The iterator compares equal to the end sentinel once the current token kind is token::none.
400 [[nodiscard]]
constexpr bool operator==(std::default_sentinel_t)
const noexcept
402 return _token.kind == token::none;
412 state_type _state = state_type::idle;
414 size_t _column_nr = 0;
// Empties the capture buffer of the token being built.
// NOTE(review): any further statements of this function are not visible in this excerpt.
420 constexpr void clear()
noexcept
422 _token.capture.
clear();
429 constexpr void capture(
char code_point)
noexcept
// Captures a full Unicode code point by writing it through the utf-8 char_map encoder.
// NOTE(review): the definition of out_it is not visible here; presumably it appends to the
// capture buffer of the current token.
438 constexpr void capture(
char32_t code_point)
noexcept
443 char_map<
"utf-8">{}.write(code_point, out_it);
// Updates position bookkeeping based on the current code point _cp: one branch for line
// terminators, one for horizontal tab (the branch bodies are not visible in this excerpt).
446 constexpr void advance_counters()
noexcept
// NOTE(review): the NEL test uses the narrow literal with hex escape 85. Where plain char is signed
// that literal is negative, so after conversion it cannot compare equal to code point U+0085 if
// _cp is char32_t. Other functions in this file compare against the char32_t literal for U+0085
// instead; confirm and align.
448 if (_cp ==
'\n' or _cp ==
'\v' or _cp ==
'\f' or _cp ==
'\x85' or _cp == U
'\u2028' or _cp == U
'\u2029') {
450 }
else if (_cp ==
'\t') {
// Reads the next code point from the input.
// Decoding goes through the utf-8 char_map reader over [_it, _last), which yields the code point
// together with a validity flag.
// NOTE(review): how the validity flag affects the returned value is not visible in this excerpt.
463 [[nodiscard]]
constexpr char32_t advance()
noexcept
469 hilet[code_point, valid] =
char_map<
"utf-8">{}.read(_it, _last);
// Handles a non-ASCII code point while the lexer is inside an identifier.
// NOTE(review): most branch bodies are not visible in this excerpt.
473 [[nodiscard]]
constexpr token::kind_type parse_token_unicode_identifier()
noexcept
// Only the low 21 bits of _cp are used for the Unicode lexical class lookup.
475 switch (ucd_get_lexical_class(_cp & 0x1f'ffff)) {
476 case unicode_lexical_class::id_start:
477 case unicode_lexical_class::id_continue:
// A minus sign is handled specially when the configuration allows it inside identifiers.
484 if (Config.minus_in_identifier and _cp ==
'-') {
491 _state = state_type::idle;
// Handles a non-ASCII code point while the lexer is inside a line comment.
497 [[nodiscard]]
constexpr token::kind_type parse_token_unicode_line_comment()
noexcept
// Only the low 21 bits of _cp are compared.
499 hilet cp_ = _cp & 0x1f'ffff;
// U+0085, U+2028 and U+2029 switch the state back to idle; the visible return yields a
// line-comment token.
500 if (cp_ == U
'\u0085' or cp_ == U
'\u2028' or cp_ == U
'\u2029') {
501 _state = state_type::idle;
504 return token::lcomment;
// Handles a non-ASCII code point while the lexer is inside a run of white-space.
// The test is on the Unicode lexical class of the low 21 bits of _cp.
// NOTE(review): the branch bodies are not visible in this excerpt.
514 [[nodiscard]]
constexpr token::kind_type parse_token_unicode_white_space()
noexcept
516 if (ucd_get_lexical_class(_cp & 0x1f'ffff) == unicode_lexical_class::white_space) {
523 _state = state_type::idle;
// Handles a non-ASCII code point while the lexer is idle.
// The Unicode lexical class of the low 21 bits of _cp selects the next state: id_start begins an
// identifier, white_space begins a white-space run, syntax stays idle.
528 [[nodiscard]]
constexpr token::kind_type parse_token_unicode_idle()
noexcept
530 switch (ucd_get_lexical_class(_cp & 0x1f'ffff)) {
531 case unicode_lexical_class::id_start:
532 _state = state_type::identifier;
538 case unicode_lexical_class::white_space:
539 _state = state_type::white_space;
545 case unicode_lexical_class::syntax:
546 _state = state_type::idle;
// The visible error return; presumably reached for lexical classes not handled above.
556 return token::error_unexepected_character;
// Slow path for code points outside the ASCII table: dispatches to the per-state Unicode helpers.
// Marked hi_no_inline.
// NOTE(review): the switch head and most case labels are not visible in this excerpt.
560 [[nodiscard]] hi_no_inline
constexpr token::kind_type parse_token_unicode()
noexcept
562 using enum state_type;
567 return parse_token_unicode_idle();
570 return parse_token_unicode_white_space();
573 return parse_token_unicode_line_comment();
576 return parse_token_unicode_identifier();
// The three string literal states share one handler.
578 case dqstring_literal:
579 case sqstring_literal:
580 case bqstring_literal:
// U+0085, U+2028 and U+2029 get dedicated treatment here.
589 if (_cp == U
'\u0085' or _cp == U
'\u2028' or _cp == U
'\u2029') {
// Delegates to the table command for the NUL character of the current state.
601 return process_command();
// Executes the table command for the current state and character c.
// c defaults to NUL; the caller must pass a character that is valid as a table index.
// Returns the token kind of the command, token::none when no token is emitted.
605 [[nodiscard]]
constexpr token::kind_type process_command(
char c =
'\0')
noexcept
// The command is copied before the state changes, so later reads use the cell that was looked up.
607 hilet command = _lexer->get_command(_state, c);
608 _state = command.next_state;
// NUL in char_to_capture means there is nothing to capture.
614 if (command.char_to_capture !=
'\0') {
615 capture(command.char_to_capture);
// Position bookkeeping only happens for commands that consume the character.
618 if (command.advance) {
619 if (command.advance_line) {
622 }
else if (command.advance_tab) {
632 return command.emit_token;
// Parses characters until one complete token has been produced and returns its kind.
635 [[nodiscard]]
constexpr token::kind_type parse_token()
noexcept
// The token position is the position at which parsing of this token starts.
637 _token.line_nr = _line_nr;
638 _token.column_nr = _column_nr;
// NOTE(review): values of _cp above the 31-bit limit presumably mark the end of input; confirm
// against advance().
641 while (_cp <= 0x7fff'ffff) {
// Table-driven path: the code point is narrowed to char and looked up in the transition table.
643 if (
auto token_kind = process_command(char_cast<char>(_cp)); token_kind != token::none) {
// Unicode path for code points that are not handled through the table.
648 auto emit_token = parse_token_unicode();
649 if (emit_token != token::none) {
// After the main loop: keep feeding the default NUL command until the state machine is idle
// again, so a pending token can be emitted.
656 while (_state != state_type::idle) {
657 if (
auto token_kind = process_command(); token_kind != token::none) {
// Compile-time checks, instantiated with std::string iterators: the token iterator must be
// movable, report token as its value_type, and satisfy the standard iterator concepts.
668 static_assert(std::movable<iterator<std::string::iterator, std::string::iterator>>);
669 static_assert(std::is_same_v<std::iterator_traits<iterator<std::string::iterator, std::string::iterator>>::value_type,
token>);
670 static_assert(std::input_or_output_iterator<iterator<std::string::iterator, std::string::iterator>>);
671 static_assert(std::weakly_incrementable<iterator<std::string::iterator, std::string::iterator>>);
679 template<
typename It, std::sentinel_for<It> ItEnd>
// Convenience overload: forwards the begin and end of the view to the iterator-pair parse().
// The view is not copied, so the characters must outlive whatever parse() returns.
690 [[nodiscard]]
constexpr auto parse(std::string_view str)
const noexcept
692 return parse(str.begin(), str.end());
// Flat table with 128 commands per state; state_type::_size supplies the number of states.
699 using transition_table_type =
std::array<command_type, to_underlying(state_type::_size) * 128>;
// Filled by the constructor through the add() helpers and indexed by get_command().
701 transition_table_type _transition_table;
// Adds the transitions for one string literal flavour delimited by quote character c.
// string_token is the token emitted for a complete literal; the three states are the flavour's
// body, after-quote and after-backslash states.
// NOTE(review): the declaration of parameter c is not visible in this excerpt.
703 constexpr void add_string_literal(
705 token::kind_type string_token,
706 state_type string_literal,
707 state_type string_literal_quote,
708 state_type string_literal_escape)
noexcept
710 using enum state_type;
// The opening quote is consumed but not captured.
712 add(idle, c, string_literal, advance);
// Default for the body: an incomplete-string error. The loop below then overrides every
// character from 1 to 127 except the quote and the backslash, so NUL keeps the error.
713 add(string_literal, any, idle, token::error_incomplete_string);
714 for (uint8_t i = 1; i != 128; ++i) {
715 if (char_cast<char>(i) != c and char_cast<char>(i) !=
'\\') {
716 add(string_literal, char_cast<char>(i), string_literal, advance, capture);
// With quote doubling: a quote inside the literal moves to the _quote state; a second quote there
// is captured as a literal quote, anything else ends the string.
720 if constexpr (Config.escape_by_quote_doubling) {
722 add(string_literal, c, string_literal_quote, advance);
724 add(string_literal_quote, any, idle, string_token);
726 add(string_literal_quote, c, string_literal, advance, capture);
// Without quote doubling the closing quote ends the literal directly.
// NOTE(review): the else separating the two alternatives is not visible in this excerpt.
729 add(string_literal, c, idle, advance, string_token);
// The backslash and the character following it are both captured verbatim.
733 add(string_literal,
'\\', string_literal_escape, advance, capture);
734 add(string_literal_escape, any, idle, token::error_incomplete_string);
735 for (uint8_t i = 1; i != 128; ++i) {
736 add(string_literal_escape, char_cast<char>(i), string_literal, advance, capture);
// Registers the three quote characters. Each one either starts a string literal (when enabled in
// Config) or is emitted as a plain token::other.
// NOTE(review): the else keywords between the paired alternatives are not visible in this excerpt.
740 constexpr void add_string_literals() noexcept
742 using enum state_type;
744 if constexpr (Config.has_single_quote_string_literal) {
745 add_string_literal(
'\'', token::sstr, sqstring_literal, sqstring_literal_quote, sqstring_literal_escape);
747 add(idle,
'\'', idle, token::other, advance, capture);
750 if constexpr (Config.has_double_quote_string_literal) {
751 add_string_literal(
'"', token::dstr, dqstring_literal, dqstring_literal_quote, dqstring_literal_escape);
753 add(idle,
'"', idle, token::other, advance, capture);
756 if constexpr (Config.has_back_quote_string_literal) {
757 add_string_literal(
'`', token::bstr, bqstring_literal, bqstring_literal_quote, bqstring_literal_escape);
759 add(idle,
'`', idle, token::other, advance, capture);
// Adds all transitions for integer and floating point literals (binary, octal, decimal and
// hexadecimal, with optional exponent and optional digit separator).
763 constexpr void add_number_literals() noexcept
765 using enum state_type;
// A leading zero gets its own state because it may introduce a radix prefix.
767 add(idle,
"0", zero, advance, capture);
768 add(idle,
"123456789", dec_integer, advance, capture);
// After the zero: by default emit it as an integer; a dot starts a float; a radix letter is
// consumed without being captured yet.
770 add(zero, any, idle, token::integer);
771 add(zero,
".", dec_float, advance, capture);
772 add(zero,
"b", zero_b, advance);
773 add(zero,
"B", zero_B, advance);
774 add(zero,
"o", zero_o, advance);
775 add(zero,
"O", zero_O, advance);
776 add(zero,
"d", zero_d, advance);
777 add(zero,
"D", zero_D, advance);
778 add(zero,
"x", zero_x, advance);
779 add(zero,
"X", zero_X, advance);
// After a radix letter: by default the zero is emitted as an integer and the matching _id state
// is entered, which turns the letter into the start of an identifier.
781 add(zero_b, any, zero_b_id, token::integer);
782 add(zero_B, any, zero_B_id, token::integer);
783 add(zero_o, any, zero_o_id, token::integer);
784 add(zero_O, any, zero_O_id, token::integer);
785 add(zero_d, any, zero_d_id, token::integer);
786 add(zero_D, any, zero_D_id, token::integer);
787 add(zero_x, any, zero_x_id, token::integer);
788 add(zero_X, any, zero_X_id, token::integer);
// When a digit follows the radix letter, the letter is captured after the fact and the digit is
// left unconsumed for the radix specific state (no advance tag here).
// NOTE(review): the character sets for zero_x and zero_X contain decimal digits and a dot but no
// hexadecimal letters, so as written a literal whose first hexadecimal digit is a letter takes the
// default path above (integer zero followed by an identifier). Confirm whether that is intended.
789 add(zero_b,
"0123456789", bin_integer,
'b');
790 add(zero_B,
"0123456789", bin_integer,
'B');
791 add(zero_o,
"0123456789", oct_integer,
'o');
792 add(zero_O,
"0123456789", oct_integer,
'O');
793 add(zero_d,
"0123456789", dec_integer,
'd');
794 add(zero_D,
"0123456789", dec_integer,
'D');
795 add(zero_x,
"0123456789.", hex_integer,
'x');
796 add(zero_X,
"0123456789.", hex_integer,
'X');
// The _id states capture the pending radix letter and continue as an identifier.
798 add(zero_b_id, any, identifier,
'b');
799 add(zero_B_id, any, identifier,
'B');
800 add(zero_o_id, any, identifier,
'o');
801 add(zero_O_id, any, identifier,
'O');
802 add(zero_d_id, any, identifier,
'd');
803 add(zero_D_id, any, identifier,
'D');
804 add(zero_x_id, any, identifier,
'x');
805 add(zero_X_id, any, identifier,
'X');
// Digits directly after a leading zero: octal when configured, decimal otherwise.
// NOTE(review): the else separating the two alternatives is not visible in this excerpt.
807 if constexpr (Config.zero_starts_octal) {
808 add(zero,
"01234567", oct_integer, advance, capture);
809 add(zero,
"89", idle, token::error_invalid_digit);
811 add(zero,
"0123456789", dec_integer, advance, capture);
// Binary digits; decimal digits outside the radix are reported as an invalid digit.
815 add(bin_integer, any, idle, token::integer);
816 add(bin_integer,
"01", bin_integer, advance, capture);
817 add(bin_integer,
"23456789", idle, token::error_invalid_digit);
// Octal digits; 8 and 9 are reported as an invalid digit.
820 add(oct_integer, any, idle, token::integer);
821 add(oct_integer,
"01234567", oct_integer, advance, capture);
822 add(oct_integer,
"89", idle, token::error_invalid_digit);
// Decimal integer: a dot switches to float; an exponent letter is consumed but only captured
// later, once it is known whether an exponent really follows.
825 add(dec_integer, any, idle, token::integer);
826 add(dec_integer,
"0123456789", dec_integer, advance, capture);
827 add(dec_integer,
".", dec_float, advance, capture);
828 add(dec_integer,
"e", dec_integer_found_e, advance);
829 add(dec_integer,
"E", dec_integer_found_E, advance);
830 add(dec_integer_found_e, any, dec_integer_found_e_id, token::integer);
831 add(dec_integer_found_E, any, dec_integer_found_E_id, token::integer);
832 add(dec_integer_found_e,
"+-0123456789", dec_sign_exponent,
'e');
833 add(dec_integer_found_E,
"+-0123456789", dec_sign_exponent,
'E');
834 add(dec_integer_found_e_id, any, identifier,
'e');
835 add(dec_integer_found_E_id, any, identifier,
'E');
// Hexadecimal integer: a dot switches to hex float, p or P starts the exponent.
838 add(hex_integer, any, idle, token::integer);
839 add(hex_integer,
"0123456789abcdefABCDEF", hex_integer, advance, capture);
840 add(hex_integer,
".", hex_float, advance, capture);
841 add(hex_integer,
"pP", hex_sign_exponent, advance, capture);
// A dot followed by a digit or an exponent letter starts a float; the character itself is left
// for the dec_float state to process.
844 add(found_dot,
"0123456789eE", dec_float);
845 add(dec_float, any, idle, token::real);
846 add(dec_float,
"0123456789", dec_float, advance, capture);
847 add(dec_float,
"e", dec_float_found_e, advance);
848 add(dec_float,
"E", dec_float_found_E, advance);
849 add(dec_float_found_e, any, dec_float_found_e_id, token::real);
850 add(dec_float_found_E, any, dec_float_found_E_id, token::real);
851 add(dec_float_found_e,
"+-0123456789", dec_sign_exponent,
'e');
852 add(dec_float_found_E,
"+-0123456789", dec_sign_exponent,
'E');
853 add(dec_float_found_e_id, any, identifier,
'e');
854 add(dec_float_found_E_id, any, identifier,
'E');
// Decimal exponent: optional sign, then at least one digit; otherwise an incomplete-exponent error.
856 add(dec_sign_exponent, any, idle, token::error_incomplete_exponent);
857 add(dec_sign_exponent,
"0123456789", dec_exponent_more, advance, capture);
858 add(dec_sign_exponent,
"+-", dec_exponent, advance, capture);
859 add(dec_exponent, any, idle, token::error_incomplete_exponent);
860 add(dec_exponent,
"0123456789", dec_exponent_more, advance, capture);
861 add(dec_exponent_more, any, idle, token::real);
862 add(dec_exponent_more,
"0123456789", dec_exponent_more, advance, capture);
// Hexadecimal float and its exponent mirror the decimal rules.
865 add(hex_float, any, idle, token::real);
866 add(hex_float,
"0123456789abcdefABCDEF", hex_float, advance, capture);
867 add(hex_float,
"pP", hex_sign_exponent, advance, capture);
868 add(hex_sign_exponent, any, idle, token::error_incomplete_exponent);
869 add(hex_sign_exponent,
"0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
870 add(hex_sign_exponent,
"+-", hex_exponent, advance, capture);
871 add(hex_exponent, any, idle, token::error_incomplete_exponent);
872 add(hex_exponent,
"0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
873 add(hex_exponent_more, any, idle, token::real);
874 add(hex_exponent_more,
"0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
// Optional digit separator: consumed without being captured.
876 if constexpr (Config.digit_separator !=
'\0') {
877 if constexpr (Config.zero_starts_octal) {
878 add(zero, Config.digit_separator, oct_integer, advance);
880 add(zero, Config.digit_separator, dec_integer, advance);
882 add(bin_integer, Config.digit_separator, bin_integer, advance);
883 add(oct_integer, Config.digit_separator, oct_integer, advance);
884 add(dec_integer, Config.digit_separator, dec_integer, advance);
885 add(hex_integer, Config.digit_separator, hex_integer, advance);
// NOTE(review): the next four transitions all jump to dec_integer, unlike the integer states
// above which stay in their own state. As written, a separator inside a float or an exponent
// makes the remainder lex as a decimal integer. This looks like a copy-paste slip; confirm the
// intended target states.
886 add(dec_float, Config.digit_separator, dec_integer, advance);
887 add(hex_float, Config.digit_separator, dec_integer, advance);
888 add(dec_exponent, Config.digit_separator, dec_integer, advance);
889 add(hex_exponent, Config.digit_separator, dec_integer, advance);
// Adds the transitions for color literals (a hash followed by hexadecimal digits) when enabled.
893 constexpr void add_color_literal() noexcept
895 using enum state_type;
897 if constexpr (Config.has_color_literal) {
// The first hexadecimal digit after the hash is added with the clear tag as well as capture.
898 add(found_hash,
"0123456789abcdefABCDEF", color_literal, clear, capture, advance);
// Any character that is not a hexadecimal digit ends the literal without being consumed.
899 add(color_literal, any, idle, token::color);
900 add(color_literal,
"0123456789abcdefABCDEF", color_literal, advance, capture);
// Adds INI-style values: after an equals sign or colon (when enabled in Config) the rest of the
// line is lexed as a single token::istr.
904 constexpr void add_ini_assignment() noexcept
906 using enum state_type;
908 if constexpr (Config.equal_is_ini_assignment) {
// Spaces and tabs after the operator are consumed without being captured. A letter or underscore
// emits the operator as token::other and starts the value; the character is left unconsumed.
910 add(found_eq,
" \t", found_eq, advance);
911 add(found_eq,
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", ini_string, token::other);
914 if constexpr (Config.colon_is_ini_assignment) {
916 add(found_colon,
" \t", found_colon, advance);
917 add(found_colon,
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", ini_string, token::other);
// The value runs until newline, vertical tab, form feed or NUL, which keep the default and emit
// the token. Carriage returns are consumed but not captured.
920 add(ini_string, any, idle, token::istr);
921 add(ini_string, excluding(
"\n\v\f\r\0"), ini_string, advance, capture);
922 add(ini_string,
'\r', ini_string, advance);
// Adds line comment and block comment transitions according to Config.
925 constexpr void add_comments() noexcept
927 using enum state_type;
// Double slash: the second slash is added with the clear tag and starts a line comment.
929 if constexpr (Config.has_double_slash_line_comment) {
930 add(found_slash,
'/', line_comment, clear, advance);
// A semicolon either starts a line comment or is a plain token::other.
// NOTE(review): the else between the two alternatives is not visible in this excerpt.
933 if constexpr (Config.has_semicolon_line_comment) {
934 add(idle,
';', line_comment, advance);
936 add(idle,
';', idle, token::other, capture, advance);
// Hash comments: every character except NUL after the hash starts the comment and is captured.
939 if constexpr (Config.has_hash_line_comment) {
940 add(found_hash, excluding(
"\0"), line_comment, clear, advance, capture);
// C style block comment opener.
943 if constexpr (Config.has_c_block_comment) {
944 add(found_slash,
'*', block_comment, advance, clear);
// SGML style opener: less-than, bang, dash, dash. Anything else after the bang is an error.
947 if constexpr (Config.has_sgml_block_comment) {
948 add(found_lt,
'!', found_lt_bang, advance);
949 add(found_lt_bang, any, idle, token::error_after_lt_bang);
950 add(found_lt_bang,
'-', found_lt_bang_dash, advance);
951 add(found_lt_bang_dash, any, idle, token::error_after_lt_bang);
952 add(found_lt_bang_dash,
'-', block_comment, advance);
// Line comment body: captured up to a line terminator. Carriage returns are dropped; newline,
// form feed and vertical tab are consumed and end the comment. NUL keeps the default, which
// also emits the comment.
955 add(line_comment, any, idle, token::lcomment);
956 add(line_comment, excluding(
"\r\n\f\v\0"), line_comment, advance, capture);
958 add(line_comment,
'\r', line_comment, advance);
959 add(line_comment,
"\n\f\v", idle, advance, token::lcomment);
// Default inside a block comment (only NUL keeps it after the overrides below): incomplete comment.
961 add(block_comment, any, idle, token::error_incomplete_comment);
// Both comment styles share the block_comment state, so at most one of them may be enabled.
963 static_assert(Config.has_c_block_comment == 0 or Config.has_sgml_block_comment == 0);
965 if constexpr (Config.has_c_block_comment) {
// A star is held back: followed by a slash the comment ends, otherwise the star is captured
// after the fact and the following character is handled by the block_comment state.
966 add(block_comment, excluding(
"*\0"), block_comment, advance, capture);
967 add(block_comment,
'*', block_comment_found_star, advance);
968 add(block_comment_found_star, any, block_comment,
'*');
969 add(block_comment_found_star,
'/', idle, advance, token::bcomment);
971 }
else if constexpr (Config.has_sgml_block_comment) {
// Dashes are held back in the same way: two dashes followed by greater-than end the comment;
// otherwise the held dashes are captured one per step (through the fin0 state for the second).
972 add(block_comment, excluding(
"-\0"), block_comment, advance, capture);
973 add(block_comment,
'-', block_comment_found_dash, advance);
974 add(block_comment_found_dash, any, block_comment,
'-');
975 add(block_comment_found_dash,
'-', block_comment_found_dash_dash, advance);
976 add(block_comment_found_dash_dash, any, block_comment_found_dash_dash_fin0,
'-');
977 add(block_comment_found_dash_dash_fin0, any, block_comment,
'-');
978 add(block_comment_found_dash_dash,
'>', idle, advance, token::bcomment);
// Adds ASCII white-space handling: a run of white-space characters forms one token::ws.
// Carriage returns are consumed but never captured; space, newline, tab, vertical tab and form
// feed are captured.
982 constexpr void add_white_space() noexcept
984 using enum state_type;
986 add(idle,
'\r', white_space, advance);
987 add(idle,
" \n\t\v\f", white_space, advance, capture);
// Any other character ends the run without being consumed.
988 add(white_space, any, idle, token::ws);
989 add(white_space,
'\r', white_space, advance);
990 add(white_space,
" \n\t\v\f", white_space, advance, capture);
// Adds ASCII identifier handling: a letter or underscore starts an identifier; letters,
// underscores and digits continue it.
993 constexpr void add_identifier() noexcept
995 using enum state_type;
997 add(idle,
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", identifier, advance, capture);
// Any other character ends the identifier without being consumed.
998 add(identifier, any, idle, token::id);
999 add(identifier,
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789", identifier, advance, capture);
// Optionally a minus sign is accepted inside (not at the start of) an identifier.
1000 if constexpr (Config.minus_in_identifier) {
1001 add(identifier,
'-', identifier, advance, capture);
// Base case of the _add() recursion: resets the cell for (from, c) to a bare transition to the
// target state, with no capture, no advance and no token. The assigned flag is left untouched by
// the visible lines.
1005 constexpr command_type& _add(state_type from,
char c, state_type to)
noexcept
1007 auto& command = get_command(from, c);
1008 command.next_state = to;
1009 command.char_to_capture =
'\0';
1010 command.advance = 0;
1011 command.advance_line = 0;
1012 command.advance_tab = 0;
1014 command.emit_token = token::none;
// Recursive case: applies one extra argument on top of the cell produced for the remaining ones.
// The remaining arguments are processed first, so first is applied last and wins on conflict.
// Recognised argument types: token kind, advance tag, clear tag, capture tag, and a plain char.
1032 template<
typename First,
typename... Args>
1033 constexpr command_type& _add(state_type from,
char c, state_type to, First first, Args
const&...args)
noexcept
1035 auto& command = _add(from, c, to, args...);
// A token kind: emit that token when the command runs.
1036 if constexpr (std::is_same_v<First, token::kind_type>) {
1037 command.emit_token = first;
1039 }
// The advance tag: consume the character; line and tab flags are derived from the character.
else if constexpr (std::is_same_v<First, advance_tag>) {
1040 command.advance = 1;
1041 if (c ==
'\n' or c ==
'\v' or c ==
'\f') {
1042 command.advance_line = 1;
1043 }
else if (c ==
'\t') {
1044 command.advance_tab = 1;
1047 }
// The clear tag (branch body not visible in this excerpt).
else if constexpr (std::is_same_v<First, clear_tag>) {
1050 }
// The capture tag: capture the matched character itself.
else if constexpr (std::is_same_v<First, capture_tag>) {
1051 command.char_to_capture = c;
1053 }
// A plain char: capture that specific character instead of the matched one.
else if constexpr (std::is_same_v<First, char>) {
1054 command.char_to_capture = first;
// Adds a transition for a single character and marks the cell as assigned.
// The assertion runs after the cell has already been rewritten; it only detects a double
// assignment, it does not prevent the overwrite.
1063 template<
typename... Args>
1064 constexpr void add(state_type from,
char c, state_type to, Args
const&...args)
noexcept
1066 auto& command = _add(from, c, to, args...);
1067 hi_assert(not command.assigned,
"Overwriting an already assigned state:char combination.");
1068 command.assigned =
true;
// Adds the same transition for every character in str and marks each cell as assigned.
// A character occurring twice in str trips the assertion on its second occurrence.
1071 template<
typename... Args>
1072 constexpr void add(state_type from, std::string_view str, state_type to, Args
const&...args)
noexcept
1074 for (
auto c : str) {
1075 auto& command = _add(from, c, to, args...);
1076 hi_assert(not command.assigned,
"Overwriting an already assigned state:char combination.");
1077 command.assigned =
true;
// Adds a default transition for all 128 table characters of a state.
// It must be the first transition added to the state (asserted), and it may not use the advance
// tag (checked at compile time). The visible lines do not set the assigned flag, which lets later
// specific add() calls override individual cells.
1081 template<
typename... Args>
1082 constexpr void add(state_type from, any_tag, state_type to, Args
const&...args)
noexcept
1084 static_assert(not has_advance_tag_argument<Args...>(),
"any should not advance");
1086 for (uint8_t c = 0; c != 128; ++c) {
1087 hilet& command = _add(from, char_cast<char>(c), to, args...);
1088 hi_assert(not command.assigned,
"any should be added first to a state");
// Adds the same transition for each of the 128 table characters that is not in exclusions, and
// marks each of those cells as assigned.
1092 template<
typename... Args>
1093 constexpr void add(state_type from, excluding_tag
const& exclusions, state_type to, Args
const&...args)
noexcept
1095 for (uint8_t c = 0; c != 128; ++c) {
1096 if (not exclusions.contains(char_cast<char>(c))) {
1097 auto& command = _add(from, char_cast<char>(c), to, args...);
1098 hi_assert(not command.assigned,
"Overwriting an already assigned state:char combination.");
1099 command.assigned =
true;