HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
lexer.hpp
1// Copyright Take Vos 2023.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "token.hpp"
8#include "../utility/module.hpp"
9#include "../unicode/module.hpp"
10#include "../char_maps/module.hpp"
11#include <ranges>
12#include <iterator>
13#include <cstdint>
14#include <string>
15#include <string_view>
16#include <format>
17#include <ostream>
18
19namespace hi { inline namespace v1 {
20
27 uint16_t zero_starts_octal : 1 = 0;
28
31 uint16_t escape_by_quote_doubling : 1 = 0;
32
37 uint16_t has_color_literal : 1 = 0;
38
39 uint16_t has_double_quote_string_literal : 1 = 0;
40 uint16_t has_single_quote_string_literal : 1 = 0;
41 uint16_t has_back_quote_string_literal : 1 = 0;
42
43 uint16_t has_double_slash_line_comment : 1 = 0;
44 uint16_t has_hash_line_comment : 1 = 0;
45 uint16_t has_semicolon_line_comment : 1 = 0;
46
47 uint16_t has_c_block_comment : 1 = 0;
48 uint16_t has_sgml_block_comment : 1 = 0;
49 uint16_t filter_white_space : 1 = 0;
50 uint16_t filter_comment : 1 = 0;
51
60 uint16_t equal_is_ini_assignment : 1 = 0;
61
71 uint16_t colon_is_ini_assignment : 1 = 0;
72
73 uint16_t minus_in_identifier : 1 = 0;
74
83 char digit_separator = '\0';
84
85 [[nodiscard]] constexpr static lexer_config c_style() noexcept
86 {
87 auto r = lexer_config{};
88 r.filter_white_space = 1;
89 r.zero_starts_octal = 1;
90 r.digit_separator = '\'';
91 r.has_double_quote_string_literal = 1;
92 r.has_single_quote_string_literal = 1;
93 r.has_double_slash_line_comment = 1;
94 r.has_c_block_comment = 1;
95 return r;
96 }
97
98 [[nodiscard]] constexpr static lexer_config css_style() noexcept
99 {
100 auto r = lexer_config{};
101 r.filter_white_space = 1;
102 r.filter_comment = 1;
103 r.has_color_literal = 1;
104 r.has_double_quote_string_literal = 1;
105 r.has_double_slash_line_comment = 1;
106 r.has_c_block_comment = 1;
107 r.minus_in_identifier = 1;
108 return r;
109 }
110
111 [[nodiscard]] constexpr static lexer_config ini_style() noexcept
112 {
113 auto r = lexer_config{};
114 r.filter_white_space = 1;
115 r.digit_separator = '_';
116 r.has_double_quote_string_literal = 1;
117 r.has_single_quote_string_literal = 1;
118 r.has_semicolon_line_comment = 1;
119 r.has_color_literal = 1;
120 r.equal_is_ini_assignment = 1;
121 return r;
122 }
123};
124
125namespace detail {
126
129template<lexer_config Config>
130class lexer {
131private:
// States of the lexer's state machine.
//
// Each state owns 128 entries in the transition table, one per ASCII
// character; `_size` is not a state, it is only used to size that table.
132 enum class state_type : uint8_t {
// Start state; every emitted token returns the machine to this state
// unless a follow-up state is needed.
133 idle,
// A leading '0' was read from idle.
134 zero,
// A '0' followed by a base-prefix letter; the letter is consumed but not
// yet captured.
135 zero_b,
136 zero_B,
137 zero_o,
138 zero_O,
139 zero_d,
140 zero_D,
141 zero_x,
142 zero_X,
// The base-prefix letter was not followed by a digit: the "0" was emitted
// as an integer and the letter is re-captured as the start of an identifier.
143 zero_b_id,
144 zero_B_id,
145 zero_o_id,
146 zero_O_id,
147 zero_d_id,
148 zero_D_id,
149 zero_x_id,
150 zero_X_id,
// Inside the digits of an integer literal.
151 bin_integer,
152 oct_integer,
153 dec_integer,
// An 'e'/'E' was consumed after a decimal integer; the `_id` variants are
// used when it turned out to start an identifier instead of an exponent.
154 dec_integer_found_e,
155 dec_integer_found_E,
156 dec_integer_found_e_id,
157 dec_integer_found_E_id,
158 hex_integer,
// Inside a floating point literal; `found_e` states mirror the integer ones.
159 dec_float,
160 dec_float_found_e,
161 dec_float_found_E,
162 dec_float_found_e_id,
163 dec_float_found_E_id,
164 hex_float,
// Exponent handling: optional sign, first digit, following digits.
165 dec_sign_exponent,
166 hex_sign_exponent,
167 dec_exponent,
168 hex_exponent,
169 dec_exponent_more,
170 hex_exponent_more,
// Inside a color literal that was started by '#'.
171 color_literal,
// String literals for each quote character: inside the literal, after a
// possibly doubled quote, and directly after a back-slash.
172 sqstring_literal,
173 sqstring_literal_quote,
174 sqstring_literal_escape,
175 dqstring_literal,
176 dqstring_literal_quote,
177 dqstring_literal_escape,
178 bqstring_literal,
179 bqstring_literal_quote,
180 bqstring_literal_escape,
// Comments, including the partial matches of a block-comment terminator.
181 line_comment,
182 block_comment,
183 block_comment_found_star,
184 block_comment_found_dash,
185 block_comment_found_dash_dash,
186 block_comment_found_dash_dash_fin0,
// A punctuation character was read that may start a longer construct.
187 found_colon,
188 found_dot,
189 found_eq,
190 found_hash,
191 found_lt,
192 found_lt_bang,
193 found_lt_bang_dash,
194 found_lt_eq,
195 found_slash,
// The unquoted value after an INI-style assignment.
196 ini_string,
197 white_space,
198 identifier,
199
// Number of states; must remain the last enumerator.
200 _size
201 };
202
// One entry of the transition table: what the lexer does when it sees a
// given ASCII character while in a given state.
205 struct command_type {
// The state the lexer switches to after executing this command.
208 state_type next_state = state_type::idle;
209
// The token kind to emit; token::none means no token is emitted yet.
212 token::kind_type emit_token = token::none;
213
// The character appended to the token's capture; '\0' means capture nothing.
216 char char_to_capture = '\0';
217
// When set, the token's capture is cleared before anything is captured.
220 uint8_t clear : 1 = 0;
221
// When set, the position counters are updated and the next code-point is read.
224 uint8_t advance : 1 = 0;
225
// Set once a transition was explicitly registered for this entry; used to
// detect accidental overwrites and to find unused idle entries.
228 uint8_t assigned : 1 = 0;
229
// With `advance`: increment the line number and reset the column to zero.
232 uint8_t advance_line : 1 = 0;
233
// With `advance`: move the column to the next multiple of eight.
236 uint8_t advance_tab : 1 = 0;
237 };
238
239 struct clear_tag {};
240 struct any_tag {};
241 struct advance_tag {};
242 struct capture_tag {};
243
244 class excluding_tag {
245 public:
246 constexpr excluding_tag(std::string exclusions) noexcept : _exclusions(std::move(exclusions)) {}
247
248 [[nodiscard]] constexpr bool contains(char c) const noexcept
249 {
250 return _exclusions.find(c) != _exclusions.npos;
251 }
252
253 private:
254 std::string _exclusions;
255 };
256
259 constexpr static auto capture = capture_tag{};
260
263 constexpr static auto advance = advance_tag{};
264
267 constexpr static auto clear = clear_tag{};
268
271 constexpr static auto any = any_tag{};
272
273 template<size_t N>
274 [[nodiscard]] constexpr excluding_tag excluding(char const (&exclusions)[N]) noexcept
275 {
276 return excluding_tag{std::string(exclusions, N - 1)};
277 }
278
279 template<typename First, typename... Args>
280 [[nodiscard]] constexpr static bool _has_advance_tag_argument() noexcept
281 {
282 if constexpr (std::is_same_v<First, advance_tag>) {
283 return true;
284 } else if constexpr (sizeof...(Args) == 0) {
285 return false;
286 } else {
287 return _has_advance_tag_argument<Args...>();
288 }
289 }
290
291 template<typename... Args>
292 [[nodiscard]] constexpr static bool has_advance_tag_argument() noexcept
293 {
294 if constexpr (sizeof...(Args) == 0) {
295 return false;
296 } else {
297 return _has_advance_tag_argument<Args...>();
298 }
299 }
300
301public:
302 constexpr lexer() noexcept : _transition_table()
303 {
304 using enum state_type;
305
306 add(idle, '/', found_slash, advance, capture);
307 add(idle, '<', found_lt, advance, capture);
308 add(idle, '#', found_hash, advance, capture);
309 add(idle, '.', found_dot, advance, capture);
310 add(idle, '=', found_eq, advance, capture);
311 add(idle, ':', found_colon, advance, capture);
312
313 add(found_slash, any, idle, token::other);
314 add(found_lt, any, idle, token::other);
315 add(found_hash, any, idle, token::other);
316 add(found_dot, any, idle, token::other);
317 add(found_eq, any, idle, token::other);
318 add(found_colon, any, idle, token::other);
319
320 // Adds the starters "\"'`"
321 add_string_literals();
322
323 // Adds the starters "0123456789"
324 add_number_literals();
325
326 add_color_literal();
327 add_comments();
328 add_white_space();
329 add_identifier();
330 add_ini_assignment();
331
332 add(idle, "~!@$%^&*()-+[]{}\\|,>?", idle, token::other, capture, advance);
333
334 // All unused entries of the idle state are unexpected characters.
335 for (uint8_t i = 0; i != 128; ++i) {
336 auto& command = get_command(idle, char_cast<char>(i));
337 if (not command.assigned) {
338 command.assigned = 1;
339 command.advance = 1;
340 // If there are actual null characters in the string then nothing gets captured.
341 command.char_to_capture = char_cast<char>(i);
342 command.emit_token = token::error_unexepected_character;
343 command.next_state = idle;
344 }
345 }
346 }
347
348 [[nodiscard]] constexpr command_type& get_command(state_type from, char c) noexcept
349 {
350 return _transition_table[to_underlying(from) * 128_uz + char_cast<size_t>(c)];
351 }
352
353 [[nodiscard]] constexpr command_type const& get_command(state_type from, char c) const noexcept
354 {
355 return _transition_table[to_underlying(from) * 128_uz + char_cast<size_t>(c)];
356 }
357
358 template<typename It, std::sentinel_for<It> ItEnd>
359 struct iterator {
360 public:
362 using value_type = token;
363 using reference = value_type const &;
364 using pointer = value_type const *;
366
367 constexpr iterator(lexer const *lexer, It first, ItEnd last) noexcept :
368 _lexer(lexer), _first(first), _last(last), _it(first)
369 {
370 _cp = advance();
371 do {
372 _token.kind = parse_token();
373 } while (Config.filter_white_space and _token.kind == token::ws);
374 }
375
376 [[nodiscard]] constexpr reference operator*() const noexcept
377 {
378 return _token;
379 }
380
381 [[nodiscard]] constexpr pointer operator&() const noexcept
382 {
383 return std::addressof(_token);
384 }
385
386 constexpr iterator& operator++() noexcept
387 {
388 hi_axiom(*this != std::default_sentinel);
389 do {
390 _token.kind = parse_token();
391 } while (Config.filter_white_space and _token.kind == token::ws);
392 return *this;
393 }
394
395 constexpr void operator++(int) noexcept
396 {
397 ++(*this);
398 }
399
400 [[nodiscard]] constexpr bool operator==(std::default_sentinel_t) const noexcept
401 {
402 return _token.kind == token::none;
403 }
404
405 private:
406 lexer const *_lexer;
407 It _first;
408 ItEnd _last;
409 It _it;
410 char32_t _cp = 0;
411 token _token;
412 state_type _state = state_type::idle;
413 size_t _line_nr = 0;
414 size_t _column_nr = 0;
415
420 constexpr void clear() noexcept
421 {
422 _token.capture.clear();
423 }
424
429 constexpr void capture(char code_point) noexcept
430 {
431 _token.capture.push_back(code_point);
432 }
433
438 constexpr void capture(char32_t code_point) noexcept
439 {
440 hi_axiom(code_point < 0x7fff'ffff);
441
442 auto out_it = std::back_inserter(_token.capture);
443 char_map<"utf-8">{}.write(code_point, out_it);
444 }
445
446 constexpr void advance_counters() noexcept
447 {
448 if (_cp == '\n' or _cp == '\v' or _cp == '\f' or _cp == '\x85' or _cp == U'\u2028' or _cp == U'\u2029') {
449 ++_line_nr;
450 } else if (_cp == '\t') {
451 _column_nr /= 8;
452 ++_column_nr;
453 _column_nr *= 8;
454 } else {
455 ++_column_nr;
456 }
457 }
458
463 [[nodiscard]] constexpr char32_t advance() noexcept
464 {
465 if (_it == _last) {
466 return 0xffff'ffff;
467 }
468
469 hilet[code_point, valid] = char_map<"utf-8">{}.read(_it, _last);
470 return code_point;
471 }
472
473 [[nodiscard]] constexpr token::kind_type parse_token_unicode_identifier() noexcept
474 {
475 switch (ucd_get_lexical_class(_cp & 0x1f'ffff)) {
476 case unicode_lexical_class::id_start:
477 case unicode_lexical_class::id_continue:
478 capture(_cp);
479 advance_counters();
480 _cp = advance();
481 return token::none;
482
483 default:
484 if (Config.minus_in_identifier and _cp == '-') {
485 capture(_cp);
486 advance_counters();
487 _cp = advance();
488 return token::none;
489
490 } else {
491 _state = state_type::idle;
492 return token::id;
493 }
494 }
495 }
496
497 [[nodiscard]] constexpr token::kind_type parse_token_unicode_line_comment() noexcept
498 {
499 hilet cp_ = _cp & 0x1f'ffff;
500 if (cp_ == U'\u0085' or cp_ == U'\u2028' or cp_ == U'\u2029') {
501 _state = state_type::idle;
502 advance_counters();
503 _cp = advance();
504 return token::lcomment;
505
506 } else {
507 capture(_cp);
508 advance_counters();
509 _cp = advance();
510 return token::none;
511 }
512 }
513
514 [[nodiscard]] constexpr token::kind_type parse_token_unicode_white_space() noexcept
515 {
516 if (ucd_get_lexical_class(_cp & 0x1f'ffff) == unicode_lexical_class::white_space) {
517 capture(_cp);
518 advance_counters();
519 _cp = advance();
520 return token::none;
521
522 } else {
523 _state = state_type::idle;
524 return token::ws;
525 }
526 }
527
528 [[nodiscard]] constexpr token::kind_type parse_token_unicode_idle() noexcept
529 {
530 switch (ucd_get_lexical_class(_cp & 0x1f'ffff)) {
531 case unicode_lexical_class::id_start:
532 _state = state_type::identifier;
533 capture(_cp);
534 advance_counters();
535 _cp = advance();
536 return token::none;
537
538 case unicode_lexical_class::white_space:
539 _state = state_type::white_space;
540 capture(_cp);
541 advance_counters();
542 _cp = advance();
543 return token::none;
544
545 case unicode_lexical_class::syntax:
546 _state = state_type::idle;
547 capture(_cp);
548 advance_counters();
549 _cp = advance();
550 return token::other;
551
552 default:
553 capture(_cp);
554 advance_counters();
555 _cp = advance();
556 return token::error_unexepected_character;
557 }
558 }
559
560 [[nodiscard]] hi_no_inline constexpr token::kind_type parse_token_unicode() noexcept
561 {
562 using enum state_type;
563
564 // Unicode by-pass.
565 switch (_state) {
566 case idle:
567 return parse_token_unicode_idle();
568
569 case white_space:
570 return parse_token_unicode_white_space();
571
572 case line_comment:
573 return parse_token_unicode_line_comment();
574
575 case identifier:
576 return parse_token_unicode_identifier();
577
578 case dqstring_literal:
579 case sqstring_literal:
580 case bqstring_literal:
581 case block_comment:
582 capture(_cp);
583 advance_counters();
584 _cp = advance();
585 return token::none;
586
587 case ini_string:
588 // Line-feeds will terminate an ini-string.
589 if (_cp == U'\u0085' or _cp == U'\u2028' or _cp == U'\u2029') {
590 return token::istr;
591 } else {
592 capture(_cp);
593 advance_counters();
594 _cp = advance();
595 return token::none;
596 }
597
598 default:
599 // Most tokens are terminated when a non-ascii code-point is found.
600 // Terminate these tokens as if we reached end-of-file.
601 return process_command();
602 }
603 }
604
605 [[nodiscard]] constexpr token::kind_type process_command(char c = '\0') noexcept
606 {
607 hilet command = _lexer->get_command(_state, c);
608 _state = command.next_state;
609
610 if (command.clear) {
611 clear();
612 }
613
614 if (command.char_to_capture != '\0') {
615 capture(command.char_to_capture);
616 }
617
618 if (command.advance) {
619 if (command.advance_line) {
620 ++_line_nr;
621 _column_nr = 0;
622 } else if (command.advance_tab) {
623 _column_nr /= 8;
624 ++_column_nr;
625 _column_nr *= 8;
626 } else {
627 ++_column_nr;
628 }
629 _cp = advance();
630 }
631
632 return command.emit_token;
633 }
634
635 [[nodiscard]] constexpr token::kind_type parse_token() noexcept
636 {
637 _token.line_nr = _line_nr;
638 _token.column_nr = _column_nr;
639 clear();
640
641 while (_cp <= 0x7fff'ffff) {
642 if (_cp <= 0x7f) {
643 if (auto token_kind = process_command(char_cast<char>(_cp)); token_kind != token::none) {
644 return token_kind;
645 }
646
647 } else {
648 auto emit_token = parse_token_unicode();
649 if (emit_token != token::none) {
650 return emit_token;
651 }
652 }
653 }
654
655 // Handle trailing state changes at end-of-file.
656 while (_state != state_type::idle) {
657 if (auto token_kind = process_command(); token_kind != token::none) {
658 return token_kind;
659 }
660 }
661
662 // We have finished parsing and there was no token captured.
663 // For example when the end of file only contains white-space.
664 return token::none;
665 }
666 };
667
668 static_assert(std::movable<iterator<std::string::iterator, std::string::iterator>>);
669 static_assert(std::is_same_v<std::iterator_traits<iterator<std::string::iterator, std::string::iterator>>::value_type, token>);
670 static_assert(std::input_or_output_iterator<iterator<std::string::iterator, std::string::iterator>>);
671 static_assert(std::weakly_incrementable<iterator<std::string::iterator, std::string::iterator>>);
672
679 template<typename It, std::sentinel_for<It> ItEnd>
680 [[nodiscard]] constexpr iterator<It, ItEnd> parse(It first, ItEnd last) const noexcept
681 {
682 return iterator<It, ItEnd>{this, first, last};
683 }
684
690 [[nodiscard]] constexpr auto parse(std::string_view str) const noexcept
691 {
692 return parse(str.begin(), str.end());
693 }
694
695private:
699 using transition_table_type = std::array<command_type, to_underlying(state_type::_size) * 128>;
700
701 transition_table_type _transition_table;
702
703 constexpr void add_string_literal(
704 char c,
705 token::kind_type string_token,
706 state_type string_literal,
707 state_type string_literal_quote,
708 state_type string_literal_escape) noexcept
709 {
710 using enum state_type;
711
712 add(idle, c, string_literal, advance);
713 add(string_literal, any, idle, token::error_incomplete_string);
714 for (uint8_t i = 1; i != 128; ++i) {
715 if (char_cast<char>(i) != c and char_cast<char>(i) != '\\') {
716 add(string_literal, char_cast<char>(i), string_literal, advance, capture);
717 }
718 }
719
720 if constexpr (Config.escape_by_quote_doubling) {
721 // Don't capture the first quote.
722 add(string_literal, c, string_literal_quote, advance);
723 // If quote is not doubled, this is the end of the string.
724 add(string_literal_quote, any, idle, string_token);
725 // Capture one quote of a doubled quote.
726 add(string_literal_quote, c, string_literal, advance, capture);
727 } else {
728 // Quote ends the string.
729 add(string_literal, c, idle, advance, string_token);
730 }
731
732 // Make sure that any escaped character sequence stays inside the string literal.
733 add(string_literal, '\\', string_literal_escape, advance, capture);
734 add(string_literal_escape, any, idle, token::error_incomplete_string);
735 for (uint8_t i = 1; i != 128; ++i) {
736 add(string_literal_escape, char_cast<char>(i), string_literal, advance, capture);
737 }
738 }
739
740 constexpr void add_string_literals() noexcept
741 {
742 using enum state_type;
743
744 if constexpr (Config.has_single_quote_string_literal) {
745 add_string_literal('\'', token::sstr, sqstring_literal, sqstring_literal_quote, sqstring_literal_escape);
746 } else {
747 add(idle, '\'', idle, token::other, advance, capture);
748 }
749
750 if constexpr (Config.has_double_quote_string_literal) {
751 add_string_literal('"', token::dstr, dqstring_literal, dqstring_literal_quote, dqstring_literal_escape);
752 } else {
753 add(idle, '"', idle, token::other, advance, capture);
754 }
755
756 if constexpr (Config.has_back_quote_string_literal) {
757 add_string_literal('`', token::bstr, bqstring_literal, bqstring_literal_quote, bqstring_literal_escape);
758 } else {
759 add(idle, '`', idle, token::other, advance, capture);
760 }
761 }
762
763 constexpr void add_number_literals() noexcept
764 {
765 using enum state_type;
766
767 add(idle, "0", zero, advance, capture);
768 add(idle, "123456789", dec_integer, advance, capture);
769
770 add(zero, any, idle, token::integer);
771 add(zero, ".", dec_float, advance, capture);
772 add(zero, "b", zero_b, advance);
773 add(zero, "B", zero_B, advance);
774 add(zero, "o", zero_o, advance);
775 add(zero, "O", zero_O, advance);
776 add(zero, "d", zero_d, advance);
777 add(zero, "D", zero_D, advance);
778 add(zero, "x", zero_x, advance);
779 add(zero, "X", zero_X, advance);
780
781 add(zero_b, any, zero_b_id, token::integer);
782 add(zero_B, any, zero_B_id, token::integer);
783 add(zero_o, any, zero_o_id, token::integer);
784 add(zero_O, any, zero_O_id, token::integer);
785 add(zero_d, any, zero_d_id, token::integer);
786 add(zero_D, any, zero_D_id, token::integer);
787 add(zero_x, any, zero_x_id, token::integer);
788 add(zero_X, any, zero_X_id, token::integer);
789 add(zero_b, "0123456789", bin_integer, 'b');
790 add(zero_B, "0123456789", bin_integer, 'B');
791 add(zero_o, "0123456789", oct_integer, 'o');
792 add(zero_O, "0123456789", oct_integer, 'O');
793 add(zero_d, "0123456789", dec_integer, 'd');
794 add(zero_D, "0123456789", dec_integer, 'D');
795 add(zero_x, "0123456789.", hex_integer, 'x');
796 add(zero_X, "0123456789.", hex_integer, 'X');
797
798 add(zero_b_id, any, identifier, 'b');
799 add(zero_B_id, any, identifier, 'B');
800 add(zero_o_id, any, identifier, 'o');
801 add(zero_O_id, any, identifier, 'O');
802 add(zero_d_id, any, identifier, 'd');
803 add(zero_D_id, any, identifier, 'D');
804 add(zero_x_id, any, identifier, 'x');
805 add(zero_X_id, any, identifier, 'X');
806
807 if constexpr (Config.zero_starts_octal) {
808 add(zero, "01234567", oct_integer, advance, capture);
809 add(zero, "89", idle, token::error_invalid_digit);
810 } else {
811 add(zero, "0123456789", dec_integer, advance, capture);
812 }
813
814 // binary-integer
815 add(bin_integer, any, idle, token::integer);
816 add(bin_integer, "01", bin_integer, advance, capture);
817 add(bin_integer, "23456789", idle, token::error_invalid_digit);
818
819 // octal-integer
820 add(oct_integer, any, idle, token::integer);
821 add(oct_integer, "01234567", oct_integer, advance, capture);
822 add(oct_integer, "89", idle, token::error_invalid_digit);
823
824 // decimal-integer
825 add(dec_integer, any, idle, token::integer);
826 add(dec_integer, "0123456789", dec_integer, advance, capture);
827 add(dec_integer, ".", dec_float, advance, capture);
828 add(dec_integer, "e", dec_integer_found_e, advance);
829 add(dec_integer, "E", dec_integer_found_E, advance);
830 add(dec_integer_found_e, any, dec_integer_found_e_id, token::integer);
831 add(dec_integer_found_E, any, dec_integer_found_E_id, token::integer);
832 add(dec_integer_found_e, "+-0123456789", dec_sign_exponent, 'e');
833 add(dec_integer_found_E, "+-0123456789", dec_sign_exponent, 'E');
834 add(dec_integer_found_e_id, any, identifier, 'e');
835 add(dec_integer_found_E_id, any, identifier, 'E');
836
837 // hexadecimal-integer
838 add(hex_integer, any, idle, token::integer);
839 add(hex_integer, "0123456789abcdefABCDEF", hex_integer, advance, capture);
840 add(hex_integer, ".", hex_float, advance, capture);
841 add(hex_integer, "pP", hex_sign_exponent, advance, capture);
842
843 // decimal-float
844 add(found_dot, "0123456789eE", dec_float);
845 add(dec_float, any, idle, token::real);
846 add(dec_float, "0123456789", dec_float, advance, capture);
847 add(dec_float, "e", dec_float_found_e, advance);
848 add(dec_float, "E", dec_float_found_E, advance);
849 add(dec_float_found_e, any, dec_float_found_e_id, token::real);
850 add(dec_float_found_E, any, dec_float_found_E_id, token::real);
851 add(dec_float_found_e, "+-0123456789", dec_sign_exponent, 'e');
852 add(dec_float_found_E, "+-0123456789", dec_sign_exponent, 'E');
853 add(dec_float_found_e_id, any, identifier, 'e');
854 add(dec_float_found_E_id, any, identifier, 'E');
855
856 add(dec_sign_exponent, any, idle, token::error_incomplete_exponent);
857 add(dec_sign_exponent, "0123456789", dec_exponent_more, advance, capture);
858 add(dec_sign_exponent, "+-", dec_exponent, advance, capture);
859 add(dec_exponent, any, idle, token::error_incomplete_exponent);
860 add(dec_exponent, "0123456789", dec_exponent_more, advance, capture);
861 add(dec_exponent_more, any, idle, token::real);
862 add(dec_exponent_more, "0123456789", dec_exponent_more, advance, capture);
863
864 // hexadecimal-float
865 add(hex_float, any, idle, token::real);
866 add(hex_float, "0123456789abcdefABCDEF", hex_float, advance, capture);
867 add(hex_float, "pP", hex_sign_exponent, advance, capture);
868 add(hex_sign_exponent, any, idle, token::error_incomplete_exponent);
869 add(hex_sign_exponent, "0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
870 add(hex_sign_exponent, "+-", hex_exponent, advance, capture);
871 add(hex_exponent, any, idle, token::error_incomplete_exponent);
872 add(hex_exponent, "0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
873 add(hex_exponent_more, any, idle, token::real);
874 add(hex_exponent_more, "0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
875
876 if constexpr (Config.digit_separator != '\0') {
877 if constexpr (Config.zero_starts_octal) {
878 add(zero, Config.digit_separator, oct_integer, advance);
879 } else {
880 add(zero, Config.digit_separator, dec_integer, advance);
881 }
882 add(bin_integer, Config.digit_separator, bin_integer, advance);
883 add(oct_integer, Config.digit_separator, oct_integer, advance);
884 add(dec_integer, Config.digit_separator, dec_integer, advance);
885 add(hex_integer, Config.digit_separator, hex_integer, advance);
886 add(dec_float, Config.digit_separator, dec_integer, advance);
887 add(hex_float, Config.digit_separator, dec_integer, advance);
888 add(dec_exponent, Config.digit_separator, dec_integer, advance);
889 add(hex_exponent, Config.digit_separator, dec_integer, advance);
890 }
891 }
892
893 constexpr void add_color_literal() noexcept
894 {
895 using enum state_type;
896
897 if constexpr (Config.has_color_literal) {
898 add(found_hash, "0123456789abcdefABCDEF", color_literal, clear, capture, advance);
899 add(color_literal, any, idle, token::color);
900 add(color_literal, "0123456789abcdefABCDEF", color_literal, advance, capture);
901 }
902 }
903
904 constexpr void add_ini_assignment() noexcept
905 {
906 using enum state_type;
907
908 if constexpr (Config.equal_is_ini_assignment) {
909 // Ignore white-space
910 add(found_eq, " \t", found_eq, advance);
911 add(found_eq, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", ini_string, token::other);
912 }
913
914 if constexpr (Config.colon_is_ini_assignment) {
915 // Ignore white-space
916 add(found_colon, " \t", found_colon, advance);
917 add(found_colon, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", ini_string, token::other);
918 }
919
920 add(ini_string, any, idle, token::istr);
921 add(ini_string, excluding("\n\v\f\r\0"), ini_string, advance, capture);
922 add(ini_string, '\r', ini_string, advance);
923 }
924
925 constexpr void add_comments() noexcept
926 {
927 using enum state_type;
928
929 if constexpr (Config.has_double_slash_line_comment) {
930 add(found_slash, '/', line_comment, clear, advance);
931 }
932
933 if constexpr (Config.has_semicolon_line_comment) {
934 add(idle, ';', line_comment, advance);
935 } else {
936 add(idle, ';', idle, token::other, capture, advance);
937 }
938
939 if constexpr (Config.has_hash_line_comment) {
940 add(found_hash, excluding("\0"), line_comment, clear, advance, capture);
941 }
942
943 if constexpr (Config.has_c_block_comment) {
944 add(found_slash, '*', block_comment, advance, clear);
945 }
946
947 if constexpr (Config.has_sgml_block_comment) {
948 add(found_lt, '!', found_lt_bang, advance);
949 add(found_lt_bang, any, idle, token::error_after_lt_bang);
950 add(found_lt_bang, '-', found_lt_bang_dash, advance);
951 add(found_lt_bang_dash, any, idle, token::error_after_lt_bang);
952 add(found_lt_bang_dash, '-', block_comment, advance);
953 }
954
955 add(line_comment, any, idle, token::lcomment);
956 add(line_comment, excluding("\r\n\f\v\0"), line_comment, advance, capture);
957
958 add(line_comment, '\r', line_comment, advance);
959 add(line_comment, "\n\f\v", idle, advance, token::lcomment);
960
961 add(block_comment, any, idle, token::error_incomplete_comment);
962
963 static_assert(Config.has_c_block_comment == 0 or Config.has_sgml_block_comment == 0);
964
965 if constexpr (Config.has_c_block_comment) {
966 add(block_comment, excluding("*\0"), block_comment, advance, capture);
967 add(block_comment, '*', block_comment_found_star, advance);
968 add(block_comment_found_star, any, block_comment, '*');
969 add(block_comment_found_star, '/', idle, advance, token::bcomment);
970
971 } else if constexpr (Config.has_sgml_block_comment) {
972 add(block_comment, excluding("-\0"), block_comment, advance, capture);
973 add(block_comment, '-', block_comment_found_dash, advance);
974 add(block_comment_found_dash, any, block_comment, '-');
975 add(block_comment_found_dash, '-', block_comment_found_dash_dash, advance);
976 add(block_comment_found_dash_dash, any, block_comment_found_dash_dash_fin0, '-');
977 add(block_comment_found_dash_dash_fin0, any, block_comment, '-');
978 add(block_comment_found_dash_dash, '>', idle, advance, token::bcomment);
979 }
980 }
981
982 constexpr void add_white_space() noexcept
983 {
984 using enum state_type;
985
986 add(idle, '\r', white_space, advance);
987 add(idle, " \n\t\v\f", white_space, advance, capture);
988 add(white_space, any, idle, token::ws);
989 add(white_space, '\r', white_space, advance);
990 add(white_space, " \n\t\v\f", white_space, advance, capture);
991 }
992
993 constexpr void add_identifier() noexcept
994 {
995 using enum state_type;
996
997 add(idle, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", identifier, advance, capture);
998 add(identifier, any, idle, token::id);
999 add(identifier, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789", identifier, advance, capture);
1000 if constexpr (Config.minus_in_identifier) {
1001 add(identifier, '-', identifier, advance, capture);
1002 }
1003 }
1004
1005 constexpr command_type& _add(state_type from, char c, state_type to) noexcept
1006 {
1007 auto& command = get_command(from, c);
1008 command.next_state = to;
1009 command.char_to_capture = '\0';
1010 command.advance = 0;
1011 command.advance_line = 0;
1012 command.advance_tab = 0;
1013 command.clear = 0;
1014 command.emit_token = token::none;
1015 return command;
1016 }
1017
1032 template<typename First, typename... Args>
1033 constexpr command_type& _add(state_type from, char c, state_type to, First first, Args const&...args) noexcept
1034 {
1035 auto& command = _add(from, c, to, args...);
1036 if constexpr (std::is_same_v<First, token::kind_type>) {
1037 command.emit_token = first;
1038
1039 } else if constexpr (std::is_same_v<First, advance_tag>) {
1040 command.advance = 1;
1041 if (c == '\n' or c == '\v' or c == '\f') {
1042 command.advance_line = 1;
1043 } else if (c == '\t') {
1044 command.advance_tab = 1;
1045 }
1046
1047 } else if constexpr (std::is_same_v<First, clear_tag>) {
1048 command.clear = 1;
1049
1050 } else if constexpr (std::is_same_v<First, capture_tag>) {
1051 command.char_to_capture = c;
1052
1053 } else if constexpr (std::is_same_v<First, char>) {
1054 command.char_to_capture = first;
1055
1056 } else {
1058 }
1059
1060 return command;
1061 }
1062
1063 template<typename... Args>
1064 constexpr void add(state_type from, char c, state_type to, Args const&...args) noexcept
1065 {
1066 auto& command = _add(from, c, to, args...);
1067 hi_assert(not command.assigned, "Overwriting an already assigned state:char combination.");
1068 command.assigned = true;
1069 }
1070
1071 template<typename... Args>
1072 constexpr void add(state_type from, std::string_view str, state_type to, Args const&...args) noexcept
1073 {
1074 for (auto c : str) {
1075 auto& command = _add(from, c, to, args...);
1076 hi_assert(not command.assigned, "Overwriting an already assigned state:char combination.");
1077 command.assigned = true;
1078 }
1079 }
1080
1081 template<typename... Args>
1082 constexpr void add(state_type from, any_tag, state_type to, Args const&...args) noexcept
1083 {
1084 static_assert(not has_advance_tag_argument<Args...>(), "any should not advance");
1085
1086 for (uint8_t c = 0; c != 128; ++c) {
1087 hilet& command = _add(from, char_cast<char>(c), to, args...);
1088 hi_assert(not command.assigned, "any should be added first to a state");
1089 }
1090 }
1091
1092 template<typename... Args>
1093 constexpr void add(state_type from, excluding_tag const& exclusions, state_type to, Args const&...args) noexcept
1094 {
1095 for (uint8_t c = 0; c != 128; ++c) {
1096 if (not exclusions.contains(char_cast<char>(c))) {
1097 auto& command = _add(from, char_cast<char>(c), to, args...);
1098 hi_assert(not command.assigned, "Overwriting an already assigned state:char combination.");
1099 command.assigned = true;
1100 }
1101 }
1102 }
1103};
1104
1105} // namespace detail
1106
1107template<lexer_config Config>
1108constexpr auto lexer = detail::lexer<Config>();
1109
1110}} // namespace hi::v1
#define hi_static_no_default(...)
This part of the code should not be reachable, unless a programming bug.
Definition assert.hpp:323
#define hi_assert(expression,...)
Assert that the expression is true.
Definition assert.hpp:199
#define hi_axiom(expression,...)
Specify an axiom; an expression that is true.
Definition assert.hpp:253
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
DOXYGEN BUG.
Definition algorithm.hpp:13
geometry/margins.hpp
Definition cache.hpp:11
@ zero
The number was zero, and this means something in the current language.
Character encoder/decoder template.
Definition char_converter.hpp:83
Definition lexer.hpp:21
uint16_t escape_by_quote_doubling
Escaping quotes within a string may be done using quote doubling.
Definition lexer.hpp:31
uint16_t equal_is_ini_assignment
The equal '=' character is used for INI-like assignment.
Definition lexer.hpp:60
char digit_separator
The character used to separate groups of numbers.
Definition lexer.hpp:83
uint16_t colon_is_ini_assignment
The colon ':' character is used for INI-like assignment.
Definition lexer.hpp:71
uint16_t has_color_literal
The language has a literal color.
Definition lexer.hpp:37
uint16_t zero_starts_octal
A zero starts in octal number.
Definition lexer.hpp:27
A configurable lexical analyzer with unicode Annex #31 support.
Definition lexer.hpp:130
constexpr auto parse(std::string_view str) const noexcept
Parse a string of UTF-8 characters.
Definition lexer.hpp:690
constexpr iterator< It, ItEnd > parse(It first, ItEnd last) const noexcept
Parse a range of UTF-8 characters.
Definition lexer.hpp:680
Definition lexer.hpp:359
Definition token.hpp:15
T addressof(T... args)
T back_inserter(T... args)
T clear(T... args)
T find(T... args)
T move(T... args)
T push_back(T... args)