HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
lexer.hpp
1// Copyright Take Vos 2023.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "token.hpp"
8#include "../utility/utility.hpp"
9#include "../unicode/unicode.hpp"
10#include "../char_maps/char_maps.hpp"
11#include <ranges>
12#include <iterator>
13#include <cstdint>
14#include <string>
15#include <string_view>
16#include <format>
17#include <ostream>
18
19hi_export_module(hikogui.parser.lexer);
20
21hi_export namespace hi { inline namespace v1 {
22
// Compile-time feature switches read by the lexer's transition-table builder.
// Each flag is a one-bit field that defaults to 0 (feature disabled).

// A number that starts with '0' continues as an octal literal; a following '8' or '9'
// is then reported as an invalid digit. When clear, a leading zero continues as decimal.
29 uint16_t zero_starts_octal : 1 = 0;
30
// Inside a string literal a doubled quote character stands for one literal quote
// instead of terminating the string.
33 uint16_t escape_by_quote_doubling : 1 = 0;
34
// '#' directly followed by a hexadecimal digit starts a color literal.
39 uint16_t has_color_literal : 1 = 0;
40
// Which quote characters open a string literal. A quote character that is not enabled
// is emitted as a single-character `token::other`.
41 uint16_t has_double_quote_string_literal : 1 = 0;
42 uint16_t has_single_quote_string_literal : 1 = 0;
43 uint16_t has_back_quote_string_literal : 1 = 0;
44
// Which introducers ("//", "#", ";") start a comment that runs to the end of the line.
45 uint16_t has_double_slash_line_comment : 1 = 0;
46 uint16_t has_hash_line_comment : 1 = 0;
47 uint16_t has_semicolon_line_comment : 1 = 0;
48
// Block comment flavors: "/* ... */" and "<!-- ... -->". The table builder
// static_asserts that at most one of the two is enabled.
49 uint16_t has_c_block_comment : 1 = 0;
50 uint16_t has_sgml_block_comment : 1 = 0;
// When set, the token iterator skips white-space tokens / line- and block-comment tokens.
51 uint16_t filter_white_space : 1 = 0;
52 uint16_t filter_comment : 1 = 0;
53
// ".*" is lexed as one operator token.
56 uint16_t has_dot_star_operator : 1 = 0;
57
// ".." is lexed as one operator token, and may be extended to "..." or "..<".
60 uint16_t has_dot_dot_operator : 1 = 0;
61
// After '=' (spaces and tabs skipped) a letter or underscore starts an ini-string
// that runs up to the end of the line.
70 uint16_t equal_is_ini_assignment : 1 = 0;
71
// Same as equal_is_ini_assignment, but triggered by ':'.
81 uint16_t colon_is_ini_assignment : 1 = 0;
82
// '-' is accepted as a continuation character of an identifier.
83 uint16_t minus_in_identifier : 1 = 0;
84
// Character that may appear between the digits of a numeric literal; it is consumed
// but not captured. '\0' disables digit separators.
93 char digit_separator = '\0';
94
95 [[nodiscard]] constexpr static lexer_config sh_style() noexcept
96 {
97 auto r = lexer_config{};
98 r.has_single_quote_string_literal = 1;
99 r.has_double_quote_string_literal = 1;
100 r.has_hash_line_comment = 1;
101 r.filter_white_space = 1;
102 r.filter_comment = 1;
103 return r;
104 }
105
106 [[nodiscard]] constexpr static lexer_config json_style() noexcept
107 {
108 auto r = lexer_config{};
109 r.has_single_quote_string_literal = 1;
110 r.has_double_quote_string_literal = 1;
111 r.has_double_slash_line_comment = 1;
112 r.filter_white_space = 1;
113 r.filter_comment = 1;
114 return r;
115 }
116
117 [[nodiscard]] constexpr static lexer_config c_style() noexcept
118 {
119 auto r = lexer_config{};
120 r.filter_white_space = 1;
121 r.zero_starts_octal = 1;
122 r.digit_separator = '\'';
123 r.has_double_quote_string_literal = 1;
124 r.has_single_quote_string_literal = 1;
125 r.has_double_slash_line_comment = 1;
126 r.has_c_block_comment = 1;
127 r.has_dot_star_operator = 1;
128 r.has_dot_dot_operator = 1;
129 return r;
130 }
131
132 [[nodiscard]] constexpr static lexer_config css_style() noexcept
133 {
134 auto r = lexer_config{};
135 r.filter_white_space = 1;
136 r.filter_comment = 1;
137 r.has_color_literal = 1;
138 r.has_double_quote_string_literal = 1;
139 r.has_double_slash_line_comment = 1;
140 r.has_c_block_comment = 1;
141 r.minus_in_identifier = 1;
142 return r;
143 }
144
145 [[nodiscard]] constexpr static lexer_config ini_style() noexcept
146 {
147 auto r = lexer_config{};
148 r.filter_white_space = 1;
149 r.digit_separator = '_';
150 r.has_double_quote_string_literal = 1;
151 r.has_single_quote_string_literal = 1;
152 r.has_semicolon_line_comment = 1;
153 r.has_color_literal = 1;
154 r.equal_is_ini_assignment = 1;
155 return r;
156 }
157};
158
159namespace detail {
160
163template<lexer_config Config>
164class lexer {
165private:
// States of the lexer's state machine. The numeric value of a state selects a row of
// 128 entries in the transition table, so `_size` must stay the last enumerator.
166 enum class state_type : uint8_t {
// Between tokens.
167 idle,
// A leading '0' was read, followed by a radix letter (b, o, d or x in either case).
168 zero,
169 zero_b,
170 zero_B,
171 zero_o,
172 zero_O,
173 zero_d,
174 zero_D,
175 zero_x,
176 zero_X,
// The radix letter was not followed by a digit: the "0" has been emitted as an integer
// and the letter is re-captured as the first character of an identifier.
177 zero_b_id,
178 zero_B_id,
179 zero_o_id,
180 zero_O_id,
181 zero_d_id,
182 zero_D_id,
183 zero_x_id,
184 zero_X_id,
// Inside the digits of an integer literal of the given radix.
185 bin_integer,
186 oct_integer,
187 dec_integer,
// An 'e'/'E' was read after a decimal integer; the `_id` variants handle the case
// where it turns out to start an identifier instead of an exponent.
188 dec_integer_found_e,
189 dec_integer_found_E,
190 dec_integer_found_e_id,
191 dec_integer_found_E_id,
192 hex_integer,
// Inside a floating point literal; the `found_e` states mirror those of dec_integer.
193 dec_float,
194 dec_float_found_e,
195 dec_float_found_E,
196 dec_float_found_e_id,
197 dec_float_found_E_id,
198 hex_float,
// Exponent parsing: optional sign, first digit, further digits.
199 dec_sign_exponent,
200 hex_sign_exponent,
201 dec_exponent,
202 hex_exponent,
203 dec_exponent_more,
204 hex_exponent_more,
// Inside a '#'-prefixed color literal.
205 color_literal,
// Inside a single-, double- or back-quoted string; `_quote` follows a quote character
// when quote doubling is enabled, `_escape` follows a backslash.
206 sqstring_literal,
207 sqstring_literal_quote,
208 sqstring_literal_escape,
209 dqstring_literal,
210 dqstring_literal_quote,
211 dqstring_literal_escape,
212 bqstring_literal,
213 bqstring_literal_quote,
214 bqstring_literal_escape,
// Inside comments; the `found_*` variants track a potential block-comment terminator.
215 line_comment,
216 block_comment,
217 block_comment_found_star,
218 block_comment_found_dash,
219 block_comment_found_dash_dash,
220 block_comment_found_dash_dash_fin0,
// The first character(s) of a potential multi-character operator have been read.
221 found_colon,
222 found_dot,
223 found_dot_dot,
224 found_eq,
225 found_eq_eq,
226 found_hash,
227 found_lt,
228 found_lt_lt,
229 found_lt_bang,
230 found_lt_bang_dash,
231 found_lt_eq,
232 found_slash,
233 found_plus,
234 found_minus,
235 found_minus_gt,
236 found_star,
237 found_and,
238 found_vbar,
239 found_caret,
240 found_percent,
241 found_bang,
242 found_question,
243 found_tilde,
244 found_gt,
245 found_gt_gt,
// Inside the value of an ini-style assignment, a run of white-space, or an identifier.
246 ini_string,
247 white_space,
248 identifier,
249
// Number of states; used to size the transition table.
250 _size
251 };
252
// One entry of the transition table: what to do when a given character is seen
// in a given state.
255 struct command_type {
// State to switch to.
258 state_type next_state = state_type::idle;
259
// Token kind to emit; `token::none` means no token is emitted by this step.
262 token::kind_type emit_token = token::none;
263
// Character appended to the token's capture; '\0' means nothing is captured.
266 char char_to_capture = '\0';
267
// Clear the token's capture before anything is captured.
270 uint8_t clear : 1 = 0;
271
// Consume the current character and read the next code-point.
274 uint8_t advance : 1 = 0;
275
// Set once the entry has been configured; the lexer constructor turns the
// unassigned entries of the idle state into unexpected-character errors.
278 uint8_t assigned : 1 = 0;
279
// When advancing: increment the line number and reset the column to zero.
282 uint8_t advance_line : 1 = 0;
283
// When advancing: move the column to the next multiple of eight.
286 uint8_t advance_tab : 1 = 0;
287 };
288
// Empty tag types; instances are passed to the add() calls of the table builder to
// select an action or to match every character.
289 struct clear_tag {};
290 struct any_tag {};
291 struct advance_tag {};
292 struct capture_tag {};
293
294 class excluding_tag {
295 public:
296 constexpr excluding_tag(std::string exclusions) noexcept : _exclusions(std::move(exclusions)) {}
297
298 [[nodiscard]] constexpr bool contains(char c) const noexcept
299 {
300 return _exclusions.find(c) != _exclusions.npos;
301 }
302
303 private:
304 std::string _exclusions;
305 };
306
// Tag value passed to add(): append the matched character to the token's capture.
309 constexpr static auto capture = capture_tag{};
310
// Tag value passed to add(): consume the matched character.
313 constexpr static auto advance = advance_tag{};
314
// Tag value passed to add(): clear the token's capture first.
317 constexpr static auto clear = clear_tag{};
318
// Tag value passed to add() in place of a character: the rule applies to any character.
321 constexpr static auto any = any_tag{};
322
323 template<size_t N>
324 [[nodiscard]] constexpr excluding_tag excluding(char const (&exclusions)[N]) noexcept
325 {
326 return excluding_tag{std::string(exclusions, N - 1)};
327 }
328
329 template<typename First, typename... Args>
330 [[nodiscard]] constexpr static bool _has_advance_tag_argument() noexcept
331 {
332 if constexpr (std::is_same_v<First, advance_tag>) {
333 return true;
334 } else if constexpr (sizeof...(Args) == 0) {
335 return false;
336 } else {
337 return _has_advance_tag_argument<Args...>();
338 }
339 }
340
341 template<typename... Args>
342 [[nodiscard]] constexpr static bool has_advance_tag_argument() noexcept
343 {
344 if constexpr (sizeof...(Args) == 0) {
345 return false;
346 } else {
347 return _has_advance_tag_argument<Args...>();
348 }
349 }
350
351public:
352 constexpr lexer() noexcept : _transition_table()
353 {
354 using enum state_type;
355
356 add(idle, '/', found_slash, advance, capture);
357 add(idle, '<', found_lt, advance, capture);
358 add(idle, '#', found_hash, advance, capture);
359 add(idle, '.', found_dot, advance, capture);
360 add(idle, '=', found_eq, advance, capture);
361 add(idle, ':', found_colon, advance, capture);
362
363 add(found_slash, any, idle, token::other);
364 add(found_lt, any, idle, token::other);
365 add(found_hash, any, idle, token::other);
366 add(found_dot, any, idle, token::other);
367 add(found_eq, any, idle, token::other);
368 add(found_colon, any, idle, token::other);
369
370 // Adds the starters "\"'`"
371 add_string_literals();
372
373 // Adds the starters "0123456789"
374 add_number_literals();
375
376 add_color_literal();
377 add_comments();
378 add_white_space();
379 add_identifier();
380 add_ini_assignment();
381 add_others();
382
383 // All unused entries of the idle state are unexpected characters.
384 for (uint8_t i = 0; i != 128; ++i) {
385 auto& command = get_command(idle, char_cast<char>(i));
386 if (not command.assigned) {
387 command.assigned = 1;
388 command.advance = 1;
389 // If there are actual null characters in the string then nothing gets captured.
390 command.char_to_capture = char_cast<char>(i);
391 command.emit_token = token::error_unexepected_character;
392 command.next_state = idle;
393 }
394 }
395 }
396
397 [[nodiscard]] constexpr command_type& get_command(state_type from, char c) noexcept
398 {
399 return _transition_table[std::to_underlying(from) * 128_uz + char_cast<size_t>(c)];
400 }
401
402 [[nodiscard]] constexpr command_type const& get_command(state_type from, char c) const noexcept
403 {
404 return _transition_table[std::to_underlying(from) * 128_uz + char_cast<size_t>(c)];
405 }
406
407 struct proxy {
408 using value_type = token;
409 using reference = value_type const&;
410
411 value_type _v;
412
413 reference operator*() const noexcept
414 {
415 return _v;
416 }
417 };
418
419 template<typename It, std::sentinel_for<It> ItEnd>
420 struct iterator {
421 public:
423 using value_type = token;
424 using reference = value_type const&;
425 using pointer = value_type const *;
427
428 constexpr iterator(lexer const *lexer, It first, ItEnd last) noexcept :
429 _lexer(lexer), _first(first), _last(last), _it(first)
430 {
431 _cp = advance();
432 do {
433 _token.kind = parse_token();
434 } while (is_token_filtered(_token));
435 }
436
437 [[nodiscard]] constexpr static bool is_token_filtered(token x) noexcept
438 {
439 return (Config.filter_white_space and x == token::ws) or (Config.filter_comment and x == token::lcomment) or
440 (Config.filter_comment and x == token::bcomment);
441 }
442
443 [[nodiscard]] constexpr reference operator*() const noexcept
444 {
445 return _token;
446 }
447
448 [[nodiscard]] constexpr pointer operator->() const noexcept
449 {
450 return std::addressof(_token);
451 }
452
453 constexpr iterator& operator++() noexcept
454 {
455 hi_axiom(*this != std::default_sentinel);
456 do {
457 _token.kind = parse_token();
458 } while (is_token_filtered(_token));
459 return *this;
460 }
461
462 constexpr proxy operator++(int) noexcept
463 {
464 auto r = proxy{**this};
465 ++(*this);
466 return r;
467 }
468
469 [[nodiscard]] constexpr bool operator==(std::default_sentinel_t) const noexcept
470 {
471 return _token.kind == token::none;
472 }
473
474 private:
// The lexer that owns the transition table.
475 lexer const *_lexer;
// Start of the text, as given to the constructor.
476 It _first;
// End of the text.
477 ItEnd _last;
// Read position; handed to the UTF-8 decoder, which moves it forward.
478 It _it;
// The current (look-ahead) code-point; 0xffff'ffff once the end of the text is reached.
479 char32_t _cp = 0;
// The token being built; this is what dereferencing the iterator yields.
480 token _token;
// Current state of the state machine.
481 state_type _state = state_type::idle;
// Position of the current code-point, zero-based.
482 size_t _line_nr = 0;
483 size_t _column_nr = 0;
484
489 constexpr void clear() noexcept
490 {
491 _token.capture.clear();
492 }
493
498 constexpr void capture(char code_point) noexcept
499 {
500 _token.capture.push_back(code_point);
501 }
502
507 constexpr void capture(char32_t code_point) noexcept
508 {
509 hi_axiom(code_point < 0x7fff'ffff);
510
511 auto out_it = std::back_inserter(_token.capture);
512 char_map<"utf-8">{}.write(code_point, out_it);
513 }
514
515 constexpr void advance_counters() noexcept
516 {
517 if (_cp == '\n' or _cp == '\v' or _cp == '\f' or _cp == '\x85' or _cp == U'\u2028' or _cp == U'\u2029') {
518 ++_line_nr;
519 } else if (_cp == '\t') {
520 _column_nr /= 8;
521 ++_column_nr;
522 _column_nr *= 8;
523 } else {
524 ++_column_nr;
525 }
526 }
527
532 [[nodiscard]] constexpr char32_t advance() noexcept
533 {
534 if (_it == _last) {
535 return 0xffff'ffff;
536 }
537
538 auto const[code_point, valid] = char_map<"utf-8">{}.read(_it, _last);
539 return code_point;
540 }
541
542 [[nodiscard]] constexpr token::kind_type parse_token_unicode_identifier() noexcept
543 {
544 switch (ucd_get_lexical_class(_cp & 0x1f'ffff)) {
545 case unicode_lexical_class::id_start:
546 case unicode_lexical_class::id_continue:
547 capture(_cp);
548 advance_counters();
549 _cp = advance();
550 return token::none;
551
552 default:
553 if (Config.minus_in_identifier and _cp == '-') {
554 capture(_cp);
555 advance_counters();
556 _cp = advance();
557 return token::none;
558
559 } else {
560 _state = state_type::idle;
561 return token::id;
562 }
563 }
564 }
565
566 [[nodiscard]] constexpr token::kind_type parse_token_unicode_line_comment() noexcept
567 {
568 auto const cp_ = _cp & 0x1f'ffff;
569 if (cp_ == U'\u0085' or cp_ == U'\u2028' or cp_ == U'\u2029') {
570 _state = state_type::idle;
571 advance_counters();
572 _cp = advance();
573 return token::lcomment;
574
575 } else {
576 capture(_cp);
577 advance_counters();
578 _cp = advance();
579 return token::none;
580 }
581 }
582
583 [[nodiscard]] constexpr token::kind_type parse_token_unicode_white_space() noexcept
584 {
585 if (ucd_get_lexical_class(_cp & 0x1f'ffff) == unicode_lexical_class::white_space) {
586 capture(_cp);
587 advance_counters();
588 _cp = advance();
589 return token::none;
590
591 } else {
592 _state = state_type::idle;
593 return token::ws;
594 }
595 }
596
597 [[nodiscard]] constexpr token::kind_type parse_token_unicode_idle() noexcept
598 {
599 switch (ucd_get_lexical_class(_cp & 0x1f'ffff)) {
600 case unicode_lexical_class::id_start:
601 _state = state_type::identifier;
602 capture(_cp);
603 advance_counters();
604 _cp = advance();
605 return token::none;
606
607 case unicode_lexical_class::white_space:
608 _state = state_type::white_space;
609 capture(_cp);
610 advance_counters();
611 _cp = advance();
612 return token::none;
613
614 case unicode_lexical_class::syntax:
615 _state = state_type::idle;
616 capture(_cp);
617 advance_counters();
618 _cp = advance();
619 return token::other;
620
621 default:
622 capture(_cp);
623 advance_counters();
624 _cp = advance();
625 return token::error_unexepected_character;
626 }
627 }
628
629 [[nodiscard]] hi_no_inline constexpr token::kind_type parse_token_unicode() noexcept
630 {
631 using enum state_type;
632
633 // Unicode by-pass.
634 switch (_state) {
635 case idle:
636 return parse_token_unicode_idle();
637
638 case white_space:
639 return parse_token_unicode_white_space();
640
641 case line_comment:
642 return parse_token_unicode_line_comment();
643
644 case identifier:
645 return parse_token_unicode_identifier();
646
647 case dqstring_literal:
648 case sqstring_literal:
649 case bqstring_literal:
650 case block_comment:
651 capture(_cp);
652 advance_counters();
653 _cp = advance();
654 return token::none;
655
656 case ini_string:
657 // Line-feeds will terminate an ini-string.
658 if (_cp == U'\u0085' or _cp == U'\u2028' or _cp == U'\u2029') {
659 return token::istr;
660 } else {
661 capture(_cp);
662 advance_counters();
663 _cp = advance();
664 return token::none;
665 }
666
667 default:
668 // Most tokens are terminated when a non-ascii code-point is found.
669 // Terminate these tokens as if we reached end-of-file.
670 return process_command();
671 }
672 }
673
674 [[nodiscard]] constexpr token::kind_type process_command(char c = '\0') noexcept
675 {
676 auto const command = _lexer->get_command(_state, c);
677 _state = command.next_state;
678
679 if (command.clear) {
680 clear();
681 }
682
683 if (command.char_to_capture != '\0') {
684 capture(command.char_to_capture);
685 }
686
687 if (command.advance) {
688 if (command.advance_line) {
689 ++_line_nr;
690 _column_nr = 0;
691 } else if (command.advance_tab) {
692 _column_nr /= 8;
693 ++_column_nr;
694 _column_nr *= 8;
695 } else {
696 ++_column_nr;
697 }
698 _cp = advance();
699 }
700
701 return command.emit_token;
702 }
703
704 [[nodiscard]] constexpr token::kind_type parse_token() noexcept
705 {
706 _token.line_nr = _line_nr;
707 _token.column_nr = _column_nr;
708 clear();
709
710 while (_cp <= 0x7fff'ffff) {
711 if (_cp <= 0x7f) {
712 if (auto token_kind = process_command(char_cast<char>(_cp)); token_kind != token::none) {
713 return token_kind;
714 }
715
716 } else {
717 auto emit_token = parse_token_unicode();
718 if (emit_token != token::none) {
719 return emit_token;
720 }
721 }
722 }
723
724 // Handle trailing state changes at end-of-file.
725 while (_state != state_type::idle) {
726 if (auto token_kind = process_command(); token_kind != token::none) {
727 return token_kind;
728 }
729 }
730
731 // We have finished parsing and there was no token captured.
732 // For example when the end of file only contains white-space.
733 return token::none;
734 }
735 };
736
737 static_assert(std::movable<iterator<std::string::iterator, std::string::iterator>>);
738 static_assert(
739 std::is_same_v<std::iterator_traits<iterator<std::string::iterator, std::string::iterator>>::value_type, token>);
740 static_assert(std::input_or_output_iterator<iterator<std::string::iterator, std::string::iterator>>);
741 static_assert(std::weakly_incrementable<iterator<std::string::iterator, std::string::iterator>>);
742
749 template<typename It, std::sentinel_for<It> ItEnd>
750 [[nodiscard]] constexpr iterator<It, ItEnd> parse(It first, ItEnd last) const noexcept
751 {
752 return iterator<It, ItEnd>{this, first, last};
753 }
754
760 [[nodiscard]] constexpr auto parse(std::string_view str) const noexcept
761 {
762 return parse(str.begin(), str.end());
763 }
764
765private:
// The transition table: one row of 128 entries (one per ASCII character) for each state.
769 using transition_table_type = std::array<command_type, std::to_underlying(state_type::_size) * 128>;
770
// Filled in by the constructor; looked up through get_command().
771 transition_table_type _transition_table;
772
773 constexpr void add_string_literal(
774 char c,
775 token::kind_type string_token,
776 state_type string_literal,
777 state_type string_literal_quote,
778 state_type string_literal_escape) noexcept
779 {
780 using enum state_type;
781
782 add(idle, c, string_literal, advance);
783 add(string_literal, any, idle, token::error_incomplete_string);
784 for (uint8_t i = 1; i != 128; ++i) {
785 if (char_cast<char>(i) != c and char_cast<char>(i) != '\\') {
786 add(string_literal, char_cast<char>(i), string_literal, advance, capture);
787 }
788 }
789
790 if constexpr (Config.escape_by_quote_doubling) {
791 // Don't capture the first quote.
792 add(string_literal, c, string_literal_quote, advance);
793 // If quote is not doubled, this is the end of the string.
794 add(string_literal_quote, any, idle, string_token);
795 // Capture one quote of a doubled quote.
796 add(string_literal_quote, c, string_literal, advance, capture);
797 } else {
798 // Quote ends the string.
799 add(string_literal, c, idle, advance, string_token);
800 }
801
802 // Make sure that any escaped character sequence stays inside the string literal.
803 add(string_literal, '\\', string_literal_escape, advance, capture);
804 add(string_literal_escape, any, idle, token::error_incomplete_string);
805 for (uint8_t i = 1; i != 128; ++i) {
806 add(string_literal_escape, char_cast<char>(i), string_literal, advance, capture);
807 }
808 }
809
810 constexpr void add_string_literals() noexcept
811 {
812 using enum state_type;
813
814 if constexpr (Config.has_single_quote_string_literal) {
815 add_string_literal('\'', token::sstr, sqstring_literal, sqstring_literal_quote, sqstring_literal_escape);
816 } else {
817 add(idle, '\'', idle, token::other, advance, capture);
818 }
819
820 if constexpr (Config.has_double_quote_string_literal) {
821 add_string_literal('"', token::dstr, dqstring_literal, dqstring_literal_quote, dqstring_literal_escape);
822 } else {
823 add(idle, '"', idle, token::other, advance, capture);
824 }
825
826 if constexpr (Config.has_back_quote_string_literal) {
827 add_string_literal('`', token::bstr, bqstring_literal, bqstring_literal_quote, bqstring_literal_escape);
828 } else {
829 add(idle, '`', idle, token::other, advance, capture);
830 }
831 }
832
833 constexpr void add_number_literals() noexcept
834 {
835 using enum state_type;
836
837 add(idle, "0", zero, advance, capture);
838 add(idle, "123456789", dec_integer, advance, capture);
839
840 add(zero, any, idle, token::integer);
841 add(zero, ".", dec_float, advance, capture);
842 add(zero, "b", zero_b, advance);
843 add(zero, "B", zero_B, advance);
844 add(zero, "o", zero_o, advance);
845 add(zero, "O", zero_O, advance);
846 add(zero, "d", zero_d, advance);
847 add(zero, "D", zero_D, advance);
848 add(zero, "x", zero_x, advance);
849 add(zero, "X", zero_X, advance);
850
851 add(zero_b, any, zero_b_id, token::integer);
852 add(zero_B, any, zero_B_id, token::integer);
853 add(zero_o, any, zero_o_id, token::integer);
854 add(zero_O, any, zero_O_id, token::integer);
855 add(zero_d, any, zero_d_id, token::integer);
856 add(zero_D, any, zero_D_id, token::integer);
857 add(zero_x, any, zero_x_id, token::integer);
858 add(zero_X, any, zero_X_id, token::integer);
859 add(zero_b, "0123456789", bin_integer, 'b');
860 add(zero_B, "0123456789", bin_integer, 'B');
861 add(zero_o, "0123456789", oct_integer, 'o');
862 add(zero_O, "0123456789", oct_integer, 'O');
863 add(zero_d, "0123456789", dec_integer, 'd');
864 add(zero_D, "0123456789", dec_integer, 'D');
865 add(zero_x, "0123456789.", hex_integer, 'x');
866 add(zero_X, "0123456789.", hex_integer, 'X');
867
868 add(zero_b_id, any, identifier, 'b');
869 add(zero_B_id, any, identifier, 'B');
870 add(zero_o_id, any, identifier, 'o');
871 add(zero_O_id, any, identifier, 'O');
872 add(zero_d_id, any, identifier, 'd');
873 add(zero_D_id, any, identifier, 'D');
874 add(zero_x_id, any, identifier, 'x');
875 add(zero_X_id, any, identifier, 'X');
876
877 if constexpr (Config.zero_starts_octal) {
878 add(zero, "01234567", oct_integer, advance, capture);
879 add(zero, "89", idle, token::error_invalid_digit);
880 } else {
881 add(zero, "0123456789", dec_integer, advance, capture);
882 }
883
884 // binary-integer
885 add(bin_integer, any, idle, token::integer);
886 add(bin_integer, "01", bin_integer, advance, capture);
887 add(bin_integer, "23456789", idle, token::error_invalid_digit);
888
889 // octal-integer
890 add(oct_integer, any, idle, token::integer);
891 add(oct_integer, "01234567", oct_integer, advance, capture);
892 add(oct_integer, "89", idle, token::error_invalid_digit);
893
894 // decimal-integer
895 add(dec_integer, any, idle, token::integer);
896 add(dec_integer, "0123456789", dec_integer, advance, capture);
897 add(dec_integer, ".", dec_float, advance, capture);
898 add(dec_integer, "e", dec_integer_found_e, advance);
899 add(dec_integer, "E", dec_integer_found_E, advance);
900 add(dec_integer_found_e, any, dec_integer_found_e_id, token::integer);
901 add(dec_integer_found_E, any, dec_integer_found_E_id, token::integer);
902 add(dec_integer_found_e, "+-0123456789", dec_sign_exponent, 'e');
903 add(dec_integer_found_E, "+-0123456789", dec_sign_exponent, 'E');
904 add(dec_integer_found_e_id, any, identifier, 'e');
905 add(dec_integer_found_E_id, any, identifier, 'E');
906
907 // hexadecimal-integer
908 add(hex_integer, any, idle, token::integer);
909 add(hex_integer, "0123456789abcdefABCDEF", hex_integer, advance, capture);
910 add(hex_integer, ".", hex_float, advance, capture);
911 add(hex_integer, "pP", hex_sign_exponent, advance, capture);
912
913 // decimal-float
914 add(found_dot, "0123456789eE", dec_float);
915 add(dec_float, any, idle, token::real);
916 add(dec_float, "0123456789", dec_float, advance, capture);
917 add(dec_float, "e", dec_float_found_e, advance);
918 add(dec_float, "E", dec_float_found_E, advance);
919 add(dec_float_found_e, any, dec_float_found_e_id, token::real);
920 add(dec_float_found_E, any, dec_float_found_E_id, token::real);
921 add(dec_float_found_e, "+-0123456789", dec_sign_exponent, 'e');
922 add(dec_float_found_E, "+-0123456789", dec_sign_exponent, 'E');
923 add(dec_float_found_e_id, any, identifier, 'e');
924 add(dec_float_found_E_id, any, identifier, 'E');
925
926 add(dec_sign_exponent, any, idle, token::error_incomplete_exponent);
927 add(dec_sign_exponent, "0123456789", dec_exponent_more, advance, capture);
928 add(dec_sign_exponent, "+-", dec_exponent, advance, capture);
929 add(dec_exponent, any, idle, token::error_incomplete_exponent);
930 add(dec_exponent, "0123456789", dec_exponent_more, advance, capture);
931 add(dec_exponent_more, any, idle, token::real);
932 add(dec_exponent_more, "0123456789", dec_exponent_more, advance, capture);
933
934 // hexadecimal-float
935 add(hex_float, any, idle, token::real);
936 add(hex_float, "0123456789abcdefABCDEF", hex_float, advance, capture);
937 add(hex_float, "pP", hex_sign_exponent, advance, capture);
938 add(hex_sign_exponent, any, idle, token::error_incomplete_exponent);
939 add(hex_sign_exponent, "0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
940 add(hex_sign_exponent, "+-", hex_exponent, advance, capture);
941 add(hex_exponent, any, idle, token::error_incomplete_exponent);
942 add(hex_exponent, "0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
943 add(hex_exponent_more, any, idle, token::real);
944 add(hex_exponent_more, "0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
945
946 if constexpr (Config.digit_separator != '\0') {
947 if constexpr (Config.zero_starts_octal) {
948 add(zero, Config.digit_separator, oct_integer, advance);
949 } else {
950 add(zero, Config.digit_separator, dec_integer, advance);
951 }
952 add(bin_integer, Config.digit_separator, bin_integer, advance);
953 add(oct_integer, Config.digit_separator, oct_integer, advance);
954 add(dec_integer, Config.digit_separator, dec_integer, advance);
955 add(hex_integer, Config.digit_separator, hex_integer, advance);
956 add(dec_float, Config.digit_separator, dec_integer, advance);
957 add(hex_float, Config.digit_separator, dec_integer, advance);
958 add(dec_exponent, Config.digit_separator, dec_integer, advance);
959 add(hex_exponent, Config.digit_separator, dec_integer, advance);
960 }
961 }
962
963 constexpr void add_color_literal() noexcept
964 {
965 using enum state_type;
966
967 if constexpr (Config.has_color_literal) {
968 add(found_hash, "0123456789abcdefABCDEF", color_literal, clear, capture, advance);
969 add(color_literal, any, idle, token::color);
970 add(color_literal, "0123456789abcdefABCDEF", color_literal, advance, capture);
971 }
972 }
973
974 constexpr void add_ini_assignment() noexcept
975 {
976 using enum state_type;
977
978 if constexpr (Config.equal_is_ini_assignment) {
979 // Ignore white-space
980 add(found_eq, " \t", found_eq, advance);
981 add(found_eq, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", ini_string, token::other);
982 }
983
984 if constexpr (Config.colon_is_ini_assignment) {
985 // Ignore white-space
986 add(found_colon, " \t", found_colon, advance);
987 add(found_colon, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", ini_string, token::other);
988 }
989
990 add(ini_string, any, idle, token::istr);
991 add(ini_string, excluding("\n\v\f\r\0"), ini_string, advance, capture);
992 add(ini_string, '\r', ini_string, advance);
993 }
994
// Add the rules for line comments and block comments, as enabled in the configuration.
// The statements are kept in their original order; later rules for the same
// state and character are written after the more general `any` rules.
995 constexpr void add_comments() noexcept
996 {
997 using enum state_type;
998
// "//" starts a line comment; the already captured '/' is cleared.
999 if constexpr (Config.has_double_slash_line_comment) {
1000 add(found_slash, '/', line_comment, clear, advance);
1001 }
1002
// ';' either starts a line comment or is a single-character token.
1003 if constexpr (Config.has_semicolon_line_comment) {
1004 add(idle, ';', line_comment, advance);
1005 } else {
1006 add(idle, ';', idle, token::other, capture, advance);
1007 }
1008
// '#' followed by anything but a nul starts a line comment; the '#' is cleared
// from the capture and the character after it is captured as comment text.
1009 if constexpr (Config.has_hash_line_comment) {
1010 add(found_hash, excluding("\0"), line_comment, clear, advance, capture);
1011 }
1012
// "/*" starts a block comment; the already captured '/' is cleared.
1013 if constexpr (Config.has_c_block_comment) {
1014 add(found_slash, '*', block_comment, advance, clear);
1015 }
1016
// "<!--" starts a block comment; "<!" followed by anything else is an error.
1017 if constexpr (Config.has_sgml_block_comment) {
1018 add(found_lt, '!', found_lt_bang, advance);
1019 add(found_lt_bang, any, idle, token::error_after_lt_bang);
1020 add(found_lt_bang, '-', found_lt_bang_dash, advance);
1021 add(found_lt_bang_dash, any, idle, token::error_after_lt_bang);
1022 add(found_lt_bang_dash, '-', block_comment, advance);
1023 }
1024
// Body of a line comment: everything up to a line-feed, form-feed or vertical-tab.
1025 add(line_comment, any, idle, token::lcomment);
1026 add(line_comment, excluding("\r\n\f\v\0"), line_comment, advance, capture);
1027
// Carriage-returns are dropped; the terminating character is consumed.
1028 add(line_comment, '\r', line_comment, advance);
1029 add(line_comment, "\n\f\v", idle, advance, token::lcomment);
1030
// A block comment that is not terminated is an error.
1031 add(block_comment, any, idle, token::error_incomplete_comment);
1032
// Both flavors share the block_comment state, so only one may be enabled.
1033 static_assert(Config.has_c_block_comment == 0 or Config.has_sgml_block_comment == 0);
1034
1035 if constexpr (Config.has_c_block_comment) {
// A '*' is only captured once it is known not to be part of the "*/" terminator.
1036 add(block_comment, excluding("*\0"), block_comment, advance, capture);
1037 add(block_comment, '*', block_comment_found_star, advance);
1038 add(block_comment_found_star, any, block_comment, '*');
1039 add(block_comment_found_star, '/', idle, advance, token::bcomment);
1040
1041 } else if constexpr (Config.has_sgml_block_comment) {
// Dashes are only captured once it is known that they are not part of the "-->" terminator.
1042 add(block_comment, excluding("-\0"), block_comment, advance, capture);
1043 add(block_comment, '-', block_comment_found_dash, advance);
1044 add(block_comment_found_dash, any, block_comment, '-');
1045 add(block_comment_found_dash, '-', block_comment_found_dash_dash, advance);
1046 add(block_comment_found_dash_dash, any, block_comment_found_dash_dash_fin0, '-');
1047 add(block_comment_found_dash_dash_fin0, any, block_comment, '-');
1048 add(block_comment_found_dash_dash, '>', idle, advance, token::bcomment);
1049 }
1050 }
1051
1052 constexpr void add_white_space() noexcept
1053 {
1054 using enum state_type;
1055
1056 add(idle, '\r', white_space, advance);
1057 add(idle, " \n\t\v\f", white_space, advance, capture);
1058 add(white_space, any, idle, token::ws);
1059 add(white_space, '\r', white_space, advance);
1060 add(white_space, " \n\t\v\f", white_space, advance, capture);
1061 }
1062
1063 constexpr void add_identifier() noexcept
1064 {
1065 using enum state_type;
1066
1067 add(idle, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", identifier, advance, capture);
1068 add(identifier, any, idle, token::id);
1069 add(identifier, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789", identifier, advance, capture);
1070 if constexpr (Config.minus_in_identifier) {
1071 add(identifier, '-', identifier, advance, capture);
1072 }
1073 }
1074
// Add the rules for operators and other punctuation. Every operator is emitted as
// `token::other`; multi-character operators are recognized one character at a time
// through the `found_*` states.
1075 constexpr void add_others() noexcept
1076 {
1077 using enum state_type;
1078
1079 // The following characters MUST only exist as single character operators.
1080 add(idle, "()[]{},@$\\", idle, token::other, capture, advance);
1081
1082 // The following characters are the first character of a potential multi-character operator.
1083 add(idle, '+', found_plus, advance, capture);
1084 add(idle, '-', found_minus, advance, capture);
1085 add(idle, '*', found_star, advance, capture);
1086 add(idle, '&', found_and, advance, capture);
1087 add(idle, '|', found_vbar, advance, capture);
1088 add(idle, '^', found_caret, advance, capture);
1089 add(idle, '%', found_percent, advance, capture);
1090 add(idle, '!', found_bang, advance, capture);
1091 add(idle, '?', found_question, advance, capture);
1092 add(idle, '~', found_tilde, advance, capture);
1093 add(idle, '>', found_gt, advance, capture);
1094
// By default the first character is an operator on its own; the character that
// follows it is not consumed.
1095 add(found_plus, any, idle, token::other);
1096 add(found_minus, any, idle, token::other);
1097 add(found_star, any, idle, token::other);
1098 add(found_and, any, idle, token::other);
1099 add(found_vbar, any, idle, token::other);
1100 add(found_caret, any, idle, token::other);
1101 add(found_percent, any, idle, token::other);
1102 add(found_bang, any, idle, token::other);
1103 add(found_question, any, idle, token::other);
1104 add(found_tilde, any, idle, token::other);
1105 add(found_gt, any, idle, token::other);
1106
1107 // The following characters are the second character of a potential multi-character operator.
1108 add(found_colon, ':', idle, advance, capture, token::other); // ::
1109 if constexpr (Config.has_dot_star_operator) {
1110 add(found_dot, '*', idle, advance, capture, token::other); // .*
1111 }
1112 if constexpr (Config.has_dot_dot_operator) {
1113 add(found_dot, '.', found_dot_dot, advance, capture); // ..
1114 }
1115 add(found_plus, "+=", idle, advance, capture, token::other); // ++, +=
1116 add(found_minus, "-=", idle, advance, capture, token::other); // --, -=
1117 add(found_minus, '>', found_minus_gt, advance, capture); // ->
1118 add(found_star, "*=", idle, advance, capture, token::other); // **, *=
// "//" is only an operator when it does not start a line comment.
1119 if constexpr (not Config.has_double_slash_line_comment) {
1120 add(found_slash, '/', idle, advance, capture, token::other); // //
1121 }
1122 add(found_slash, '=', idle, advance, capture, token::other); // /=
1123 add(found_and, "&=+-*", idle, advance, capture, token::other); // &&, &=, &+, &-, &*
1124 add(found_vbar, "|=", idle, advance, capture, token::other); // ||, |=
1125 add(found_caret, "^=", idle, advance, capture, token::other); // ^^, ^=
1126 add(found_percent, "%=", idle, advance, capture, token::other); // %%, %=
1127 add(found_bang, '=', idle, advance, capture, token::other); // !=
1128 add(found_question, "?=", idle, advance, capture, token::other); // ??, ?=
1129 add(found_tilde, '=', idle, advance, capture, token::other); // ~=
1130 add(found_lt, '=', found_lt_eq, advance, capture); // <=
1131 add(found_lt, '<', found_lt_lt, advance, capture); // <<
1132 add(found_gt, '=', idle, advance, capture, token::other); // >=
1133 add(found_gt, '>', found_gt_gt, advance, capture); // >>
1134 add(found_eq, '=', found_eq_eq, advance, capture); // ==
1135
// By default a two-character operator is complete.
1136 add(found_minus_gt, any, idle, token::other);
1137 add(found_dot_dot, any, idle, token::other);
1138 add(found_lt_eq, any, idle, token::other);
1139 add(found_lt_lt, any, idle, token::other);
1140 add(found_gt_gt, any, idle, token::other);
1141 add(found_eq_eq, any, idle, token::other);
1142
1143 // The following characters are the third character of a potential multi-character operator.
1144 add(found_minus_gt, '*', idle, advance, capture, token::other); // ->*
1145 add(found_dot_dot, ".<", idle, advance, capture, token::other); // ..., ..<
1146 add(found_lt_eq, '>', idle, advance, capture, token::other); // <=>
1147 add(found_lt_lt, '=', idle, advance, capture, token::other); // <<=
1148 add(found_gt_gt, '=', idle, advance, capture, token::other); // >>=
1149 add(found_eq_eq, '=', idle, advance, capture, token::other); // ===
1150 }
1151
1152 constexpr command_type& _add(state_type from, char c, state_type to) noexcept
1153 {
1154 auto& command = get_command(from, c);
1155 command.next_state = to;
1156 command.char_to_capture = '\0';
1157 command.advance = 0;
1158 command.advance_line = 0;
1159 command.advance_tab = 0;
1160 command.clear = 0;
1161 command.emit_token = token::none;
1162 return command;
1163 }
1164
1179 template<typename First, typename... Args>
1180 constexpr command_type& _add(state_type from, char c, state_type to, First first, Args const&...args) noexcept
1181 {
1182 auto& command = _add(from, c, to, args...);
1183 if constexpr (std::is_same_v<First, token::kind_type>) {
1184 command.emit_token = first;
1185
1186 } else if constexpr (std::is_same_v<First, advance_tag>) {
1187 command.advance = 1;
1188 if (c == '\n' or c == '\v' or c == '\f') {
1189 command.advance_line = 1;
1190 } else if (c == '\t') {
1191 command.advance_tab = 1;
1192 }
1193
1194 } else if constexpr (std::is_same_v<First, clear_tag>) {
1195 command.clear = 1;
1196
1197 } else if constexpr (std::is_same_v<First, capture_tag>) {
1198 command.char_to_capture = c;
1199
1200 } else if constexpr (std::is_same_v<First, char>) {
1201 command.char_to_capture = first;
1202
1203 } else {
1204 hi_static_no_default();
1205 }
1206
1207 return command;
1208 }
1209
1210 template<typename... Args>
1211 constexpr void add(state_type from, char c, state_type to, Args const&...args) noexcept
1212 {
1213 auto& command = _add(from, c, to, args...);
1214 hi_assert(not command.assigned, "Overwriting an already assigned state:char combination.");
1215 command.assigned = true;
1216 }
1217
1218 template<typename... Args>
1219 constexpr void add(state_type from, std::string_view str, state_type to, Args const&...args) noexcept
1220 {
1221 for (auto c : str) {
1222 auto& command = _add(from, c, to, args...);
1223 hi_assert(not command.assigned, "Overwriting an already assigned state:char combination.");
1224 command.assigned = true;
1225 }
1226 }
1227
1228 template<typename... Args>
1229 constexpr void add(state_type from, any_tag, state_type to, Args const&...args) noexcept
1230 {
1231 static_assert(not has_advance_tag_argument<Args...>(), "any should not advance");
1232
1233 for (uint8_t c = 0; c != 128; ++c) {
1234 auto const& command = _add(from, char_cast<char>(c), to, args...);
1235 hi_assert(not command.assigned, "any should be added first to a state");
1236 }
1237 }
1238
1239 template<typename... Args>
1240 constexpr void add(state_type from, excluding_tag const& exclusions, state_type to, Args const&...args) noexcept
1241 {
1242 for (uint8_t c = 0; c != 128; ++c) {
1243 if (not exclusions.contains(char_cast<char>(c))) {
1244 auto& command = _add(from, char_cast<char>(c), to, args...);
1245 hi_assert(not command.assigned, "Overwriting an already assigned state:char combination.");
1246 command.assigned = true;
1247 }
1248 }
1249 }
1250};
1251
1252} // namespace detail
1253
1254template<lexer_config Config>
1255constexpr auto lexer = detail::lexer<Config>();
1256
1257}} // namespace hi::v1
The HikoGUI namespace.
Definition array_generic.hpp:20
@ zero
The number was zero, and this means something in the current language.
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
Character encoder/decoder template.
Definition char_converter.hpp:89
Definition lexer.hpp:23
uint16_t escape_by_quote_doubling
Escaping quotes within a string may be done using quote doubling.
Definition lexer.hpp:33
uint16_t equal_is_ini_assignment
The equal '=' character is used for INI-like assignment.
Definition lexer.hpp:70
char digit_separator
The character used to separate groups of numbers.
Definition lexer.hpp:93
uint16_t colon_is_ini_assignment
The colon ':' character is used for INI-like assignment.
Definition lexer.hpp:81
uint16_t has_color_literal
The language has a literal color.
Definition lexer.hpp:39
uint16_t zero_starts_octal
A leading zero starts an octal number.
Definition lexer.hpp:29
uint16_t has_dot_star_operator
The '.*' operator is recognized as a token.
Definition lexer.hpp:56
uint16_t has_dot_dot_operator
The '..', '...' and '..<' operators used in Swift as the range operators.
Definition lexer.hpp:60
A configurable lexical analyzer with unicode Annex #31 support.
Definition lexer.hpp:164
constexpr auto parse(std::string_view str) const noexcept
Parse a string of UTF-8 characters.
Definition lexer.hpp:760
constexpr iterator< It, ItEnd > parse(It first, ItEnd last) const noexcept
Parse a range of UTF-8 characters.
Definition lexer.hpp:750
Definition lexer.hpp:407
Definition lexer.hpp:420
Definition token.hpp:18
T addressof(T... args)
T back_inserter(T... args)
T clear(T... args)
T find(T... args)
T move(T... args)
T push_back(T... args)