HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
lexer.hpp
1// Copyright Take Vos 2023.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "token.hpp"
8#include "../utility/utility.hpp"
9#include "../unicode/unicode.hpp"
10#include "../char_maps/module.hpp"
11#include <ranges>
12#include <iterator>
13#include <cstdint>
14#include <string>
15#include <string_view>
16#include <format>
17#include <ostream>
18
19namespace hi { inline namespace v1 {
20
/// When set, digits after a leading '0' are lexed as an octal integer and
/// '8'/'9' produce an invalid-digit error; otherwise they continue a decimal integer.
27 uint16_t zero_starts_octal : 1 = 0;
28
/// When set, a doubled quote inside a string literal is captured as a single
/// quote instead of terminating the string.
31 uint16_t escape_by_quote_doubling : 1 = 0;
32
/// When set, '#' followed by hexadecimal digits is lexed as a color token.
37 uint16_t has_color_literal : 1 = 0;
38
/// Enable string literals delimited by '"', '\'' and '`' respectively.
/// A disabled quote character is emitted as a single-character `token::other`.
39 uint16_t has_double_quote_string_literal : 1 = 0;
40 uint16_t has_single_quote_string_literal : 1 = 0;
41 uint16_t has_back_quote_string_literal : 1 = 0;
42
/// Enable line comments introduced by "//", '#' and ';' respectively.
43 uint16_t has_double_slash_line_comment : 1 = 0;
44 uint16_t has_hash_line_comment : 1 = 0;
45 uint16_t has_semicolon_line_comment : 1 = 0;
46
/// Enable block comments opened by "/*" (C) or "<!--" (SGML).
/// The lexer statically asserts that at most one of the two is enabled.
47 uint16_t has_c_block_comment : 1 = 0;
48 uint16_t has_sgml_block_comment : 1 = 0;
/// When set, white-space tokens / comment tokens are skipped by the token iterator.
49 uint16_t filter_white_space : 1 = 0;
50 uint16_t filter_comment : 1 = 0;
51
/// When set, ".*" is lexed as a single operator.
54 uint16_t has_dot_star_operator : 1 = 0;
55
/// When set, ".." starts an operator which may be extended to "..." or "..<".
58 uint16_t has_dot_dot_operator : 1 = 0;
59
/// When set, '=' followed by optional spaces/tabs and then a letter or underscore
/// starts an ini-string that runs to the end of the line.
68 uint16_t equal_is_ini_assignment : 1 = 0;
69
/// Same as `equal_is_ini_assignment`, but triggered by ':'.
79 uint16_t colon_is_ini_assignment : 1 = 0;
80
/// When set, '-' is accepted as a continuation character of an identifier.
81 uint16_t minus_in_identifier : 1 = 0;
82
/// Character that is skipped (not captured) inside numeric literals.
/// The value '\0' disables digit separators.
91 char digit_separator = '\0';
92
93 [[nodiscard]] constexpr static lexer_config sh_style() noexcept
94 {
95 auto r = lexer_config{};
96 r.has_single_quote_string_literal = 1;
97 r.has_double_quote_string_literal = 1;
98 r.has_hash_line_comment = 1;
99 r.filter_white_space = 1;
100 r.filter_comment = 1;
101 return r;
102 }
103
104 [[nodiscard]] constexpr static lexer_config json_style() noexcept
105 {
106 auto r = lexer_config{};
107 r.has_single_quote_string_literal = 1;
108 r.has_double_quote_string_literal = 1;
109 r.has_double_slash_line_comment = 1;
110 r.filter_white_space = 1;
111 r.filter_comment = 1;
112 return r;
113 }
114
115 [[nodiscard]] constexpr static lexer_config c_style() noexcept
116 {
117 auto r = lexer_config{};
118 r.filter_white_space = 1;
119 r.zero_starts_octal = 1;
120 r.digit_separator = '\'';
121 r.has_double_quote_string_literal = 1;
122 r.has_single_quote_string_literal = 1;
123 r.has_double_slash_line_comment = 1;
124 r.has_c_block_comment = 1;
125 r.has_dot_star_operator = 1;
126 r.has_dot_dot_operator = 1;
127 return r;
128 }
129
130 [[nodiscard]] constexpr static lexer_config css_style() noexcept
131 {
132 auto r = lexer_config{};
133 r.filter_white_space = 1;
134 r.filter_comment = 1;
135 r.has_color_literal = 1;
136 r.has_double_quote_string_literal = 1;
137 r.has_double_slash_line_comment = 1;
138 r.has_c_block_comment = 1;
139 r.minus_in_identifier = 1;
140 return r;
141 }
142
143 [[nodiscard]] constexpr static lexer_config ini_style() noexcept
144 {
145 auto r = lexer_config{};
146 r.filter_white_space = 1;
147 r.digit_separator = '_';
148 r.has_double_quote_string_literal = 1;
149 r.has_single_quote_string_literal = 1;
150 r.has_semicolon_line_comment = 1;
151 r.has_color_literal = 1;
152 r.equal_is_ini_assignment = 1;
153 return r;
154 }
155};
156
157namespace detail {
158
161template<lexer_config Config>
162class lexer {
163private:
/** The states of the lexer's state machine.
 *
 * Each state owns 128 entries (one per ASCII character) in the transition
 * table, so the enumerator values are used as row indices. `_size` must stay
 * the last enumerator.
 */
164 enum class state_type : uint8_t {
// Start state; every emitted token returns here.
165 idle,
// A leading '0', optionally followed by a radix letter. The `_id` variants are
// entered when the radix letter turned out not to start a number; they
// re-capture the letter as the first character of an identifier.
166 zero,
167 zero_b,
168 zero_B,
169 zero_o,
170 zero_O,
171 zero_d,
172 zero_D,
173 zero_x,
174 zero_X,
175 zero_b_id,
176 zero_B_id,
177 zero_o_id,
178 zero_O_id,
179 zero_d_id,
180 zero_D_id,
181 zero_x_id,
182 zero_X_id,
// Integer and floating point literals, including exponent handling.
183 bin_integer,
184 oct_integer,
185 dec_integer,
186 dec_integer_found_e,
187 dec_integer_found_E,
188 dec_integer_found_e_id,
189 dec_integer_found_E_id,
190 hex_integer,
191 dec_float,
192 dec_float_found_e,
193 dec_float_found_E,
194 dec_float_found_e_id,
195 dec_float_found_E_id,
196 hex_float,
197 dec_sign_exponent,
198 hex_sign_exponent,
199 dec_exponent,
200 hex_exponent,
201 dec_exponent_more,
202 hex_exponent_more,
// '#' followed by hexadecimal digits.
203 color_literal,
// Single-, double- and back-quoted string literals.
204 sqstring_literal,
205 sqstring_literal_quote,
206 sqstring_literal_escape,
207 dqstring_literal,
208 dqstring_literal_quote,
209 dqstring_literal_escape,
210 bqstring_literal,
211 bqstring_literal_quote,
212 bqstring_literal_escape,
// Line and block comments, including partially matched block terminators.
213 line_comment,
214 block_comment,
215 block_comment_found_star,
216 block_comment_found_dash,
217 block_comment_found_dash_dash,
218 block_comment_found_dash_dash_fin0,
// Prefixes of potential multi-character operators.
219 found_colon,
220 found_dot,
221 found_dot_dot,
222 found_eq,
223 found_eq_eq,
224 found_hash,
225 found_lt,
226 found_lt_lt,
227 found_lt_bang,
228 found_lt_bang_dash,
229 found_lt_eq,
230 found_slash,
231 found_plus,
232 found_minus,
233 found_minus_gt,
234 found_star,
235 found_and,
236 found_vbar,
237 found_caret,
238 found_percent,
239 found_bang,
240 found_question,
241 found_tilde,
242 found_gt,
243 found_gt_gt,
// Value of an ini-assignment, white-space run and identifier.
244 ini_string,
245 white_space,
246 identifier,
247
// Number of states; used to size the transition table.
248 _size
249 };
250
/** One entry of the transition table.
 *
 * Describes what the lexer does when a given character is seen in a given state.
 */
253 struct command_type {
/// The state the lexer switches to.
256 state_type next_state = state_type::idle;
257
/// The token kind to emit; `token::none` keeps lexing the current token.
260 token::kind_type emit_token = token::none;
261
/// The character appended to the token's capture; '\0' captures nothing.
264 char char_to_capture = '\0';
265
/// Clear the token's capture before capturing.
268 uint8_t clear : 1 = 0;
269
/// Consume the current code point and read the next one.
272 uint8_t advance : 1 = 0;
273
/// Set once the entry has been filled in; the constructor turns unassigned
/// idle-state entries into unexpected-character errors.
276 uint8_t assigned : 1 = 0;
277
/// When advancing: increment the line number and reset the column to zero.
280 uint8_t advance_line : 1 = 0;
281
/// When advancing: move the column to the next multiple of eight.
284 uint8_t advance_tab : 1 = 0;
285 };
286
// Empty tag types; instances of these are passed as extra arguments to add()
// when the transition table is built.
287 struct clear_tag {};
288 struct any_tag {};
289 struct advance_tag {};
290 struct capture_tag {};
291
292 class excluding_tag {
293 public:
294 constexpr excluding_tag(std::string exclusions) noexcept : _exclusions(std::move(exclusions)) {}
295
296 [[nodiscard]] constexpr bool contains(char c) const noexcept
297 {
298 return _exclusions.find(c) != _exclusions.npos;
299 }
300
301 private:
302 std::string _exclusions;
303 };
304
/// Tag passed to add(); judging by its use, it requests that the matched character is captured.
307 constexpr static auto capture = capture_tag{};
308
/// Tag passed to add(); judging by its use, it requests that the lexer advances past the matched character.
311 constexpr static auto advance = advance_tag{};
312
/// Tag passed to add(); judging by its use, it requests that the current capture is cleared.
315 constexpr static auto clear = clear_tag{};
316
/// Tag passed to add() in place of a character; used to set the default transition of a state.
319 constexpr static auto any = any_tag{};
320
321 template<size_t N>
322 [[nodiscard]] constexpr excluding_tag excluding(char const (&exclusions)[N]) noexcept
323 {
324 return excluding_tag{std::string(exclusions, N - 1)};
325 }
326
327 template<typename First, typename... Args>
328 [[nodiscard]] constexpr static bool _has_advance_tag_argument() noexcept
329 {
330 if constexpr (std::is_same_v<First, advance_tag>) {
331 return true;
332 } else if constexpr (sizeof...(Args) == 0) {
333 return false;
334 } else {
335 return _has_advance_tag_argument<Args...>();
336 }
337 }
338
339 template<typename... Args>
340 [[nodiscard]] constexpr static bool has_advance_tag_argument() noexcept
341 {
342 if constexpr (sizeof...(Args) == 0) {
343 return false;
344 } else {
345 return _has_advance_tag_argument<Args...>();
346 }
347 }
348
349public:
350 constexpr lexer() noexcept : _transition_table()
351 {
352 using enum state_type;
353
354 add(idle, '/', found_slash, advance, capture);
355 add(idle, '<', found_lt, advance, capture);
356 add(idle, '#', found_hash, advance, capture);
357 add(idle, '.', found_dot, advance, capture);
358 add(idle, '=', found_eq, advance, capture);
359 add(idle, ':', found_colon, advance, capture);
360
361 add(found_slash, any, idle, token::other);
362 add(found_lt, any, idle, token::other);
363 add(found_hash, any, idle, token::other);
364 add(found_dot, any, idle, token::other);
365 add(found_eq, any, idle, token::other);
366 add(found_colon, any, idle, token::other);
367
368 // Adds the starters "\"'`"
369 add_string_literals();
370
371 // Adds the starters "0123456789"
372 add_number_literals();
373
374 add_color_literal();
375 add_comments();
376 add_white_space();
377 add_identifier();
378 add_ini_assignment();
379 add_others();
380
381 // All unused entries of the idle state are unexpected characters.
382 for (uint8_t i = 0; i != 128; ++i) {
383 auto& command = get_command(idle, char_cast<char>(i));
384 if (not command.assigned) {
385 command.assigned = 1;
386 command.advance = 1;
387 // If there are actual null characters in the string then nothing gets captured.
388 command.char_to_capture = char_cast<char>(i);
389 command.emit_token = token::error_unexepected_character;
390 command.next_state = idle;
391 }
392 }
393 }
394
395 [[nodiscard]] constexpr command_type& get_command(state_type from, char c) noexcept
396 {
397 return _transition_table[std::to_underlying(from) * 128_uz + char_cast<size_t>(c)];
398 }
399
400 [[nodiscard]] constexpr command_type const& get_command(state_type from, char c) const noexcept
401 {
402 return _transition_table[std::to_underlying(from) * 128_uz + char_cast<size_t>(c)];
403 }
404
405 struct proxy {
406 using value_type = token;
407 using reference = value_type const&;
408
409 value_type _v;
410
411 reference operator*() const noexcept
412 {
413 return _v;
414 }
415 };
416
417 template<typename It, std::sentinel_for<It> ItEnd>
418 struct iterator {
419 public:
421 using value_type = token;
422 using reference = value_type const&;
423 using pointer = value_type const *;
425
426 constexpr iterator(lexer const *lexer, It first, ItEnd last) noexcept :
427 _lexer(lexer), _first(first), _last(last), _it(first)
428 {
429 _cp = advance();
430 do {
431 _token.kind = parse_token();
432 } while (is_token_filtered(_token));
433 }
434
435 [[nodiscard]] constexpr static bool is_token_filtered(token x) noexcept
436 {
437 return (Config.filter_white_space and x == token::ws) or (Config.filter_comment and x == token::lcomment) or
438 (Config.filter_comment and x == token::bcomment);
439 }
440
441 [[nodiscard]] constexpr reference operator*() const noexcept
442 {
443 return _token;
444 }
445
446 [[nodiscard]] constexpr pointer operator->() const noexcept
447 {
448 return std::addressof(_token);
449 }
450
451 constexpr iterator& operator++() noexcept
452 {
453 hi_axiom(*this != std::default_sentinel);
454 do {
455 _token.kind = parse_token();
456 } while (is_token_filtered(_token));
457 return *this;
458 }
459
460 constexpr proxy operator++(int) noexcept
461 {
462 auto r = proxy{**this};
463 ++(*this);
464 return r;
465 }
466
467 [[nodiscard]] constexpr bool operator==(std::default_sentinel_t) const noexcept
468 {
469 return _token.kind == token::none;
470 }
471
472 private:
/// The lexer whose transition table is used.
473 lexer const *_lexer;
/// Start of the text, as passed to the constructor.
474 It _first;
/// End of the text.
475 ItEnd _last;
/// Read position; points just past the code point held in `_cp`.
476 It _it;
/// The code point currently being processed; 0xffff'ffff signals end-of-text.
477 char32_t _cp = 0;
/// The token being built, or the most recently emitted token.
478 token _token;
/// Current state of the state machine.
479 state_type _state = state_type::idle;
/// Zero-based line number of `_cp`.
480 size_t _line_nr = 0;
/// Zero-based column number of `_cp`.
481 size_t _column_nr = 0;
482
487 constexpr void clear() noexcept
488 {
489 _token.capture.clear();
490 }
491
496 constexpr void capture(char code_point) noexcept
497 {
498 _token.capture.push_back(code_point);
499 }
500
505 constexpr void capture(char32_t code_point) noexcept
506 {
507 hi_axiom(code_point < 0x7fff'ffff);
508
509 auto out_it = std::back_inserter(_token.capture);
510 char_map<"utf-8">{}.write(code_point, out_it);
511 }
512
513 constexpr void advance_counters() noexcept
514 {
515 if (_cp == '\n' or _cp == '\v' or _cp == '\f' or _cp == '\x85' or _cp == U'\u2028' or _cp == U'\u2029') {
516 ++_line_nr;
517 } else if (_cp == '\t') {
518 _column_nr /= 8;
519 ++_column_nr;
520 _column_nr *= 8;
521 } else {
522 ++_column_nr;
523 }
524 }
525
530 [[nodiscard]] constexpr char32_t advance() noexcept
531 {
532 if (_it == _last) {
533 return 0xffff'ffff;
534 }
535
536 hilet[code_point, valid] = char_map<"utf-8">{}.read(_it, _last);
537 return code_point;
538 }
539
540 [[nodiscard]] constexpr token::kind_type parse_token_unicode_identifier() noexcept
541 {
542 switch (ucd_get_lexical_class(_cp & 0x1f'ffff)) {
543 case unicode_lexical_class::id_start:
544 case unicode_lexical_class::id_continue:
545 capture(_cp);
546 advance_counters();
547 _cp = advance();
548 return token::none;
549
550 default:
551 if (Config.minus_in_identifier and _cp == '-') {
552 capture(_cp);
553 advance_counters();
554 _cp = advance();
555 return token::none;
556
557 } else {
558 _state = state_type::idle;
559 return token::id;
560 }
561 }
562 }
563
564 [[nodiscard]] constexpr token::kind_type parse_token_unicode_line_comment() noexcept
565 {
566 hilet cp_ = _cp & 0x1f'ffff;
567 if (cp_ == U'\u0085' or cp_ == U'\u2028' or cp_ == U'\u2029') {
568 _state = state_type::idle;
569 advance_counters();
570 _cp = advance();
571 return token::lcomment;
572
573 } else {
574 capture(_cp);
575 advance_counters();
576 _cp = advance();
577 return token::none;
578 }
579 }
580
581 [[nodiscard]] constexpr token::kind_type parse_token_unicode_white_space() noexcept
582 {
583 if (ucd_get_lexical_class(_cp & 0x1f'ffff) == unicode_lexical_class::white_space) {
584 capture(_cp);
585 advance_counters();
586 _cp = advance();
587 return token::none;
588
589 } else {
590 _state = state_type::idle;
591 return token::ws;
592 }
593 }
594
595 [[nodiscard]] constexpr token::kind_type parse_token_unicode_idle() noexcept
596 {
597 switch (ucd_get_lexical_class(_cp & 0x1f'ffff)) {
598 case unicode_lexical_class::id_start:
599 _state = state_type::identifier;
600 capture(_cp);
601 advance_counters();
602 _cp = advance();
603 return token::none;
604
605 case unicode_lexical_class::white_space:
606 _state = state_type::white_space;
607 capture(_cp);
608 advance_counters();
609 _cp = advance();
610 return token::none;
611
612 case unicode_lexical_class::syntax:
613 _state = state_type::idle;
614 capture(_cp);
615 advance_counters();
616 _cp = advance();
617 return token::other;
618
619 default:
620 capture(_cp);
621 advance_counters();
622 _cp = advance();
623 return token::error_unexepected_character;
624 }
625 }
626
627 [[nodiscard]] hi_no_inline constexpr token::kind_type parse_token_unicode() noexcept
628 {
629 using enum state_type;
630
631 // Unicode by-pass.
632 switch (_state) {
633 case idle:
634 return parse_token_unicode_idle();
635
636 case white_space:
637 return parse_token_unicode_white_space();
638
639 case line_comment:
640 return parse_token_unicode_line_comment();
641
642 case identifier:
643 return parse_token_unicode_identifier();
644
645 case dqstring_literal:
646 case sqstring_literal:
647 case bqstring_literal:
648 case block_comment:
649 capture(_cp);
650 advance_counters();
651 _cp = advance();
652 return token::none;
653
654 case ini_string:
655 // Line-feeds will terminate an ini-string.
656 if (_cp == U'\u0085' or _cp == U'\u2028' or _cp == U'\u2029') {
657 return token::istr;
658 } else {
659 capture(_cp);
660 advance_counters();
661 _cp = advance();
662 return token::none;
663 }
664
665 default:
666 // Most tokens are terminated when a non-ascii code-point is found.
667 // Terminate these tokens as if we reached end-of-file.
668 return process_command();
669 }
670 }
671
672 [[nodiscard]] constexpr token::kind_type process_command(char c = '\0') noexcept
673 {
674 hilet command = _lexer->get_command(_state, c);
675 _state = command.next_state;
676
677 if (command.clear) {
678 clear();
679 }
680
681 if (command.char_to_capture != '\0') {
682 capture(command.char_to_capture);
683 }
684
685 if (command.advance) {
686 if (command.advance_line) {
687 ++_line_nr;
688 _column_nr = 0;
689 } else if (command.advance_tab) {
690 _column_nr /= 8;
691 ++_column_nr;
692 _column_nr *= 8;
693 } else {
694 ++_column_nr;
695 }
696 _cp = advance();
697 }
698
699 return command.emit_token;
700 }
701
702 [[nodiscard]] constexpr token::kind_type parse_token() noexcept
703 {
704 _token.line_nr = _line_nr;
705 _token.column_nr = _column_nr;
706 clear();
707
708 while (_cp <= 0x7fff'ffff) {
709 if (_cp <= 0x7f) {
710 if (auto token_kind = process_command(char_cast<char>(_cp)); token_kind != token::none) {
711 return token_kind;
712 }
713
714 } else {
715 auto emit_token = parse_token_unicode();
716 if (emit_token != token::none) {
717 return emit_token;
718 }
719 }
720 }
721
722 // Handle trailing state changes at end-of-file.
723 while (_state != state_type::idle) {
724 if (auto token_kind = process_command(); token_kind != token::none) {
725 return token_kind;
726 }
727 }
728
729 // We have finished parsing and there was no token captured.
730 // For example when the end of file only contains white-space.
731 return token::none;
732 }
733 };
734
735 static_assert(std::movable<iterator<std::string::iterator, std::string::iterator>>);
736 static_assert(
737 std::is_same_v<std::iterator_traits<iterator<std::string::iterator, std::string::iterator>>::value_type, token>);
738 static_assert(std::input_or_output_iterator<iterator<std::string::iterator, std::string::iterator>>);
739 static_assert(std::weakly_incrementable<iterator<std::string::iterator, std::string::iterator>>);
740
747 template<typename It, std::sentinel_for<It> ItEnd>
748 [[nodiscard]] constexpr iterator<It, ItEnd> parse(It first, ItEnd last) const noexcept
749 {
750 return iterator<It, ItEnd>{this, first, last};
751 }
752
758 [[nodiscard]] constexpr auto parse(std::string_view str) const noexcept
759 {
760 return parse(str.begin(), str.end());
761 }
762
763private:
/// The transition table: 128 entries (one per ASCII character) for each state.
767 using transition_table_type = std::array<command_type, std::to_underlying(state_type::_size) * 128>;
768
/// Filled in by the constructor; indexed through get_command().
769 transition_table_type _transition_table;
770
771 constexpr void add_string_literal(
772 char c,
773 token::kind_type string_token,
774 state_type string_literal,
775 state_type string_literal_quote,
776 state_type string_literal_escape) noexcept
777 {
778 using enum state_type;
779
780 add(idle, c, string_literal, advance);
781 add(string_literal, any, idle, token::error_incomplete_string);
782 for (uint8_t i = 1; i != 128; ++i) {
783 if (char_cast<char>(i) != c and char_cast<char>(i) != '\\') {
784 add(string_literal, char_cast<char>(i), string_literal, advance, capture);
785 }
786 }
787
788 if constexpr (Config.escape_by_quote_doubling) {
789 // Don't capture the first quote.
790 add(string_literal, c, string_literal_quote, advance);
791 // If quote is not doubled, this is the end of the string.
792 add(string_literal_quote, any, idle, string_token);
793 // Capture one quote of a doubled quote.
794 add(string_literal_quote, c, string_literal, advance, capture);
795 } else {
796 // Quote ends the string.
797 add(string_literal, c, idle, advance, string_token);
798 }
799
800 // Make sure that any escaped character sequence stays inside the string literal.
801 add(string_literal, '\\', string_literal_escape, advance, capture);
802 add(string_literal_escape, any, idle, token::error_incomplete_string);
803 for (uint8_t i = 1; i != 128; ++i) {
804 add(string_literal_escape, char_cast<char>(i), string_literal, advance, capture);
805 }
806 }
807
808 constexpr void add_string_literals() noexcept
809 {
810 using enum state_type;
811
812 if constexpr (Config.has_single_quote_string_literal) {
813 add_string_literal('\'', token::sstr, sqstring_literal, sqstring_literal_quote, sqstring_literal_escape);
814 } else {
815 add(idle, '\'', idle, token::other, advance, capture);
816 }
817
818 if constexpr (Config.has_double_quote_string_literal) {
819 add_string_literal('"', token::dstr, dqstring_literal, dqstring_literal_quote, dqstring_literal_escape);
820 } else {
821 add(idle, '"', idle, token::other, advance, capture);
822 }
823
824 if constexpr (Config.has_back_quote_string_literal) {
825 add_string_literal('`', token::bstr, bqstring_literal, bqstring_literal_quote, bqstring_literal_escape);
826 } else {
827 add(idle, '`', idle, token::other, advance, capture);
828 }
829 }
830
831 constexpr void add_number_literals() noexcept
832 {
833 using enum state_type;
834
835 add(idle, "0", zero, advance, capture);
836 add(idle, "123456789", dec_integer, advance, capture);
837
838 add(zero, any, idle, token::integer);
839 add(zero, ".", dec_float, advance, capture);
840 add(zero, "b", zero_b, advance);
841 add(zero, "B", zero_B, advance);
842 add(zero, "o", zero_o, advance);
843 add(zero, "O", zero_O, advance);
844 add(zero, "d", zero_d, advance);
845 add(zero, "D", zero_D, advance);
846 add(zero, "x", zero_x, advance);
847 add(zero, "X", zero_X, advance);
848
849 add(zero_b, any, zero_b_id, token::integer);
850 add(zero_B, any, zero_B_id, token::integer);
851 add(zero_o, any, zero_o_id, token::integer);
852 add(zero_O, any, zero_O_id, token::integer);
853 add(zero_d, any, zero_d_id, token::integer);
854 add(zero_D, any, zero_D_id, token::integer);
855 add(zero_x, any, zero_x_id, token::integer);
856 add(zero_X, any, zero_X_id, token::integer);
857 add(zero_b, "0123456789", bin_integer, 'b');
858 add(zero_B, "0123456789", bin_integer, 'B');
859 add(zero_o, "0123456789", oct_integer, 'o');
860 add(zero_O, "0123456789", oct_integer, 'O');
861 add(zero_d, "0123456789", dec_integer, 'd');
862 add(zero_D, "0123456789", dec_integer, 'D');
863 add(zero_x, "0123456789.", hex_integer, 'x');
864 add(zero_X, "0123456789.", hex_integer, 'X');
865
866 add(zero_b_id, any, identifier, 'b');
867 add(zero_B_id, any, identifier, 'B');
868 add(zero_o_id, any, identifier, 'o');
869 add(zero_O_id, any, identifier, 'O');
870 add(zero_d_id, any, identifier, 'd');
871 add(zero_D_id, any, identifier, 'D');
872 add(zero_x_id, any, identifier, 'x');
873 add(zero_X_id, any, identifier, 'X');
874
875 if constexpr (Config.zero_starts_octal) {
876 add(zero, "01234567", oct_integer, advance, capture);
877 add(zero, "89", idle, token::error_invalid_digit);
878 } else {
879 add(zero, "0123456789", dec_integer, advance, capture);
880 }
881
882 // binary-integer
883 add(bin_integer, any, idle, token::integer);
884 add(bin_integer, "01", bin_integer, advance, capture);
885 add(bin_integer, "23456789", idle, token::error_invalid_digit);
886
887 // octal-integer
888 add(oct_integer, any, idle, token::integer);
889 add(oct_integer, "01234567", oct_integer, advance, capture);
890 add(oct_integer, "89", idle, token::error_invalid_digit);
891
892 // decimal-integer
893 add(dec_integer, any, idle, token::integer);
894 add(dec_integer, "0123456789", dec_integer, advance, capture);
895 add(dec_integer, ".", dec_float, advance, capture);
896 add(dec_integer, "e", dec_integer_found_e, advance);
897 add(dec_integer, "E", dec_integer_found_E, advance);
898 add(dec_integer_found_e, any, dec_integer_found_e_id, token::integer);
899 add(dec_integer_found_E, any, dec_integer_found_E_id, token::integer);
900 add(dec_integer_found_e, "+-0123456789", dec_sign_exponent, 'e');
901 add(dec_integer_found_E, "+-0123456789", dec_sign_exponent, 'E');
902 add(dec_integer_found_e_id, any, identifier, 'e');
903 add(dec_integer_found_E_id, any, identifier, 'E');
904
905 // hexadecimal-integer
906 add(hex_integer, any, idle, token::integer);
907 add(hex_integer, "0123456789abcdefABCDEF", hex_integer, advance, capture);
908 add(hex_integer, ".", hex_float, advance, capture);
909 add(hex_integer, "pP", hex_sign_exponent, advance, capture);
910
911 // decimal-float
912 add(found_dot, "0123456789eE", dec_float);
913 add(dec_float, any, idle, token::real);
914 add(dec_float, "0123456789", dec_float, advance, capture);
915 add(dec_float, "e", dec_float_found_e, advance);
916 add(dec_float, "E", dec_float_found_E, advance);
917 add(dec_float_found_e, any, dec_float_found_e_id, token::real);
918 add(dec_float_found_E, any, dec_float_found_E_id, token::real);
919 add(dec_float_found_e, "+-0123456789", dec_sign_exponent, 'e');
920 add(dec_float_found_E, "+-0123456789", dec_sign_exponent, 'E');
921 add(dec_float_found_e_id, any, identifier, 'e');
922 add(dec_float_found_E_id, any, identifier, 'E');
923
924 add(dec_sign_exponent, any, idle, token::error_incomplete_exponent);
925 add(dec_sign_exponent, "0123456789", dec_exponent_more, advance, capture);
926 add(dec_sign_exponent, "+-", dec_exponent, advance, capture);
927 add(dec_exponent, any, idle, token::error_incomplete_exponent);
928 add(dec_exponent, "0123456789", dec_exponent_more, advance, capture);
929 add(dec_exponent_more, any, idle, token::real);
930 add(dec_exponent_more, "0123456789", dec_exponent_more, advance, capture);
931
932 // hexadecimal-float
933 add(hex_float, any, idle, token::real);
934 add(hex_float, "0123456789abcdefABCDEF", hex_float, advance, capture);
935 add(hex_float, "pP", hex_sign_exponent, advance, capture);
936 add(hex_sign_exponent, any, idle, token::error_incomplete_exponent);
937 add(hex_sign_exponent, "0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
938 add(hex_sign_exponent, "+-", hex_exponent, advance, capture);
939 add(hex_exponent, any, idle, token::error_incomplete_exponent);
940 add(hex_exponent, "0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
941 add(hex_exponent_more, any, idle, token::real);
942 add(hex_exponent_more, "0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
943
944 if constexpr (Config.digit_separator != '\0') {
945 if constexpr (Config.zero_starts_octal) {
946 add(zero, Config.digit_separator, oct_integer, advance);
947 } else {
948 add(zero, Config.digit_separator, dec_integer, advance);
949 }
950 add(bin_integer, Config.digit_separator, bin_integer, advance);
951 add(oct_integer, Config.digit_separator, oct_integer, advance);
952 add(dec_integer, Config.digit_separator, dec_integer, advance);
953 add(hex_integer, Config.digit_separator, hex_integer, advance);
954 add(dec_float, Config.digit_separator, dec_integer, advance);
955 add(hex_float, Config.digit_separator, dec_integer, advance);
956 add(dec_exponent, Config.digit_separator, dec_integer, advance);
957 add(hex_exponent, Config.digit_separator, dec_integer, advance);
958 }
959 }
960
961 constexpr void add_color_literal() noexcept
962 {
963 using enum state_type;
964
965 if constexpr (Config.has_color_literal) {
966 add(found_hash, "0123456789abcdefABCDEF", color_literal, clear, capture, advance);
967 add(color_literal, any, idle, token::color);
968 add(color_literal, "0123456789abcdefABCDEF", color_literal, advance, capture);
969 }
970 }
971
972 constexpr void add_ini_assignment() noexcept
973 {
974 using enum state_type;
975
976 if constexpr (Config.equal_is_ini_assignment) {
977 // Ignore white-space
978 add(found_eq, " \t", found_eq, advance);
979 add(found_eq, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", ini_string, token::other);
980 }
981
982 if constexpr (Config.colon_is_ini_assignment) {
983 // Ignore white-space
984 add(found_colon, " \t", found_colon, advance);
985 add(found_colon, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", ini_string, token::other);
986 }
987
988 add(ini_string, any, idle, token::istr);
989 add(ini_string, excluding("\n\v\f\r\0"), ini_string, advance, capture);
990 add(ini_string, '\r', ini_string, advance);
991 }
992
993 constexpr void add_comments() noexcept
994 {
995 using enum state_type;
996
997 if constexpr (Config.has_double_slash_line_comment) {
998 add(found_slash, '/', line_comment, clear, advance);
999 }
1000
1001 if constexpr (Config.has_semicolon_line_comment) {
1002 add(idle, ';', line_comment, advance);
1003 } else {
1004 add(idle, ';', idle, token::other, capture, advance);
1005 }
1006
1007 if constexpr (Config.has_hash_line_comment) {
1008 add(found_hash, excluding("\0"), line_comment, clear, advance, capture);
1009 }
1010
1011 if constexpr (Config.has_c_block_comment) {
1012 add(found_slash, '*', block_comment, advance, clear);
1013 }
1014
1015 if constexpr (Config.has_sgml_block_comment) {
1016 add(found_lt, '!', found_lt_bang, advance);
1017 add(found_lt_bang, any, idle, token::error_after_lt_bang);
1018 add(found_lt_bang, '-', found_lt_bang_dash, advance);
1019 add(found_lt_bang_dash, any, idle, token::error_after_lt_bang);
1020 add(found_lt_bang_dash, '-', block_comment, advance);
1021 }
1022
1023 add(line_comment, any, idle, token::lcomment);
1024 add(line_comment, excluding("\r\n\f\v\0"), line_comment, advance, capture);
1025
1026 add(line_comment, '\r', line_comment, advance);
1027 add(line_comment, "\n\f\v", idle, advance, token::lcomment);
1028
1029 add(block_comment, any, idle, token::error_incomplete_comment);
1030
1031 static_assert(Config.has_c_block_comment == 0 or Config.has_sgml_block_comment == 0);
1032
1033 if constexpr (Config.has_c_block_comment) {
1034 add(block_comment, excluding("*\0"), block_comment, advance, capture);
1035 add(block_comment, '*', block_comment_found_star, advance);
1036 add(block_comment_found_star, any, block_comment, '*');
1037 add(block_comment_found_star, '/', idle, advance, token::bcomment);
1038
1039 } else if constexpr (Config.has_sgml_block_comment) {
1040 add(block_comment, excluding("-\0"), block_comment, advance, capture);
1041 add(block_comment, '-', block_comment_found_dash, advance);
1042 add(block_comment_found_dash, any, block_comment, '-');
1043 add(block_comment_found_dash, '-', block_comment_found_dash_dash, advance);
1044 add(block_comment_found_dash_dash, any, block_comment_found_dash_dash_fin0, '-');
1045 add(block_comment_found_dash_dash_fin0, any, block_comment, '-');
1046 add(block_comment_found_dash_dash, '>', idle, advance, token::bcomment);
1047 }
1048 }
1049
1050 constexpr void add_white_space() noexcept
1051 {
1052 using enum state_type;
1053
1054 add(idle, '\r', white_space, advance);
1055 add(idle, " \n\t\v\f", white_space, advance, capture);
1056 add(white_space, any, idle, token::ws);
1057 add(white_space, '\r', white_space, advance);
1058 add(white_space, " \n\t\v\f", white_space, advance, capture);
1059 }
1060
1061 constexpr void add_identifier() noexcept
1062 {
1063 using enum state_type;
1064
1065 add(idle, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", identifier, advance, capture);
1066 add(identifier, any, idle, token::id);
1067 add(identifier, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789", identifier, advance, capture);
1068 if constexpr (Config.minus_in_identifier) {
1069 add(identifier, '-', identifier, advance, capture);
1070 }
1071 }
1072
1073 constexpr void add_others() noexcept
1074 {
1075 using enum state_type;
1076
1077 // The following characters MUST only exists as single character operators.
1078 add(idle, "()[]{},@$\\", idle, token::other, capture, advance);
1079
1080 // The following characters are the first character of a potential multi-character operator.
1081 add(idle, '+', found_plus, advance, capture);
1082 add(idle, '-', found_minus, advance, capture);
1083 add(idle, '*', found_star, advance, capture);
1084 add(idle, '&', found_and, advance, capture);
1085 add(idle, '|', found_vbar, advance, capture);
1086 add(idle, '^', found_caret, advance, capture);
1087 add(idle, '%', found_percent, advance, capture);
1088 add(idle, '!', found_bang, advance, capture);
1089 add(idle, '?', found_question, advance, capture);
1090 add(idle, '~', found_tilde, advance, capture);
1091 add(idle, '>', found_gt, advance, capture);
1092
1093 add(found_plus, any, idle, token::other);
1094 add(found_minus, any, idle, token::other);
1095 add(found_star, any, idle, token::other);
1096 add(found_and, any, idle, token::other);
1097 add(found_vbar, any, idle, token::other);
1098 add(found_caret, any, idle, token::other);
1099 add(found_percent, any, idle, token::other);
1100 add(found_bang, any, idle, token::other);
1101 add(found_question, any, idle, token::other);
1102 add(found_tilde, any, idle, token::other);
1103 add(found_gt, any, idle, token::other);
1104
1105 // The following characters are the second character of a potential multi-character operator.
1106 add(found_colon, ':', idle, advance, capture, token::other); // ::
1107 if constexpr (Config.has_dot_star_operator) {
1108 add(found_dot, '*', idle, advance, capture, token::other); // .*
1109 }
1110 if constexpr (Config.has_dot_dot_operator) {
1111 add(found_dot, '.', found_dot_dot, advance, capture); // ..
1112 }
1113 add(found_plus, "+=", idle, advance, capture, token::other); // ++, +=
1114 add(found_minus, "-=", idle, advance, capture, token::other); // --, -=
1115 add(found_minus, '>', found_minus_gt, advance, capture); // ->
1116 add(found_star, "*=", idle, advance, capture, token::other); // **, *=
1117 if constexpr (not Config.has_double_slash_line_comment) {
1118 add(found_slash, '/', idle, advance, capture, token::other); // //
1119 }
1120 add(found_slash, '=', idle, advance, capture, token::other); // /=
1121 add(found_and, "&=+-*", idle, advance, capture, token::other); // &&, &=, &+, &-, &*
1122 add(found_vbar, "|=", idle, advance, capture, token::other); // ||, |=
1123 add(found_caret, "^=", idle, advance, capture, token::other); // ^^, ^=
1124 add(found_percent, "%=", idle, advance, capture, token::other); // %%, %=
1125 add(found_bang, '=', idle, advance, capture, token::other); // !=
1126 add(found_question, "?=", idle, advance, capture, token::other); // ??, ?=
1127 add(found_tilde, '=', idle, advance, capture, token::other); // ~=
1128 add(found_lt, '=', found_lt_eq, advance, capture); // <=
1129 add(found_lt, '<', found_lt_lt, advance, capture); // <<
1130 add(found_gt, '=', idle, advance, capture, token::other); // >=
1131 add(found_gt, '>', found_gt_gt, advance, capture); // >>
1132 add(found_eq, '=', found_eq_eq, advance, capture); // ==
1133
1134 add(found_minus_gt, any, idle, token::other);
1135 add(found_dot_dot, any, idle, token::other);
1136 add(found_lt_eq, any, idle, token::other);
1137 add(found_lt_lt, any, idle, token::other);
1138 add(found_gt_gt, any, idle, token::other);
1139 add(found_eq_eq, any, idle, token::other);
1140
1141 // The following characters are the third character of a potential multi-character operator.
1142 add(found_minus_gt, '*', idle, advance, capture, token::other); // ->*
1143 add(found_dot_dot, ".<", idle, advance, capture, token::other); // ..., ..<
1144 add(found_lt_eq, '>', idle, advance, capture, token::other); // <=>
1145 add(found_lt_lt, '=', idle, advance, capture, token::other); // <<=
1146 add(found_gt_gt, '=', idle, advance, capture, token::other); // >>=
1147 add(found_eq_eq, '=', idle, advance, capture, token::other); // ===
1148 }
1149
1150 constexpr command_type& _add(state_type from, char c, state_type to) noexcept
1151 {
1152 auto& command = get_command(from, c);
1153 command.next_state = to;
1154 command.char_to_capture = '\0';
1155 command.advance = 0;
1156 command.advance_line = 0;
1157 command.advance_tab = 0;
1158 command.clear = 0;
1159 command.emit_token = token::none;
1160 return command;
1161 }
1162
1177 template<typename First, typename... Args>
1178 constexpr command_type& _add(state_type from, char c, state_type to, First first, Args const&...args) noexcept
1179 {
1180 auto& command = _add(from, c, to, args...);
1181 if constexpr (std::is_same_v<First, token::kind_type>) {
1182 command.emit_token = first;
1183
1184 } else if constexpr (std::is_same_v<First, advance_tag>) {
1185 command.advance = 1;
1186 if (c == '\n' or c == '\v' or c == '\f') {
1187 command.advance_line = 1;
1188 } else if (c == '\t') {
1189 command.advance_tab = 1;
1190 }
1191
1192 } else if constexpr (std::is_same_v<First, clear_tag>) {
1193 command.clear = 1;
1194
1195 } else if constexpr (std::is_same_v<First, capture_tag>) {
1196 command.char_to_capture = c;
1197
1198 } else if constexpr (std::is_same_v<First, char>) {
1199 command.char_to_capture = first;
1200
1201 } else {
1202 hi_static_no_default();
1203 }
1204
1205 return command;
1206 }
1207
1208 template<typename... Args>
1209 constexpr void add(state_type from, char c, state_type to, Args const&...args) noexcept
1210 {
1211 auto& command = _add(from, c, to, args...);
1212 hi_assert(not command.assigned, "Overwriting an already assigned state:char combination.");
1213 command.assigned = true;
1214 }
1215
1216 template<typename... Args>
1217 constexpr void add(state_type from, std::string_view str, state_type to, Args const&...args) noexcept
1218 {
1219 for (auto c : str) {
1220 auto& command = _add(from, c, to, args...);
1221 hi_assert(not command.assigned, "Overwriting an already assigned state:char combination.");
1222 command.assigned = true;
1223 }
1224 }
1225
1226 template<typename... Args>
1227 constexpr void add(state_type from, any_tag, state_type to, Args const&...args) noexcept
1228 {
1229 static_assert(not has_advance_tag_argument<Args...>(), "any should not advance");
1230
1231 for (uint8_t c = 0; c != 128; ++c) {
1232 hilet& command = _add(from, char_cast<char>(c), to, args...);
1233 hi_assert(not command.assigned, "any should be added first to a state");
1234 }
1235 }
1236
1237 template<typename... Args>
1238 constexpr void add(state_type from, excluding_tag const& exclusions, state_type to, Args const&...args) noexcept
1239 {
1240 for (uint8_t c = 0; c != 128; ++c) {
1241 if (not exclusions.contains(char_cast<char>(c))) {
1242 auto& command = _add(from, char_cast<char>(c), to, args...);
1243 hi_assert(not command.assigned, "Overwriting an already assigned state:char combination.");
1244 command.assigned = true;
1245 }
1246 }
1247 }
1248};
1249
1250} // namespace detail
1251
1252template<lexer_config Config>
1253constexpr auto lexer = detail::lexer<Config>();
1254
1255}} // namespace hi::v1
DOXYGEN BUG.
Definition algorithm.hpp:16
geometry/margins.hpp
Definition lookahead_iterator.hpp:5
@ zero
The number was zero, and this means something in the current language.
constexpr Out narrow_cast(In const &rhs) noexcept
Cast numeric values without loss of precision.
Definition cast.hpp:377
Character encoder/decoder template.
Definition char_converter.hpp:86
Definition lexer.hpp:21
uint16_t escape_by_quote_doubling
Escaping quotes within a string may be done using quote doubling.
Definition lexer.hpp:31
uint16_t equal_is_ini_assignment
The equal '=' character is used for INI-like assignment.
Definition lexer.hpp:68
char digit_separator
The character used to separate groups of numbers.
Definition lexer.hpp:91
uint16_t colon_is_ini_assignment
The colon ':' character is used for INI-like assignment.
Definition lexer.hpp:79
uint16_t has_color_literal
The language has a literal color.
Definition lexer.hpp:37
uint16_t zero_starts_octal
A zero starts an octal number.
Definition lexer.hpp:27
uint16_t has_dot_star_operator
The '.*' sequence is lexed as a single operator.
Definition lexer.hpp:54
uint16_t has_dot_dot_operator
The '..', '...' and '..<' operators used in Swift as the range operators.
Definition lexer.hpp:58
A configurable lexical analyzer with unicode Annex #31 support.
Definition lexer.hpp:162
constexpr auto parse(std::string_view str) const noexcept
Parse a string of UTF-8 characters.
Definition lexer.hpp:758
constexpr iterator< It, ItEnd > parse(It first, ItEnd last) const noexcept
Parse a range of UTF-8 characters.
Definition lexer.hpp:748
Definition lexer.hpp:405
Definition lexer.hpp:418
Definition token.hpp:15
Definition concepts.hpp:39
Definition concepts.hpp:42
T addressof(T... args)
T back_inserter(T... args)
T clear(T... args)
T find(T... args)
T move(T... args)
T push_back(T... args)