HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
lexer.hpp
1// Copyright Take Vos 2023.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "token.hpp"
8#include "../utility/module.hpp"
9#include "../unicode/module.hpp"
10#include "../char_maps/module.hpp"
11#include <ranges>
12#include <iterator>
13#include <cstdint>
14#include <string>
15#include <string_view>
16#include <format>
17#include <ostream>
18
19namespace hi { inline namespace v1 {
20
27 uint16_t zero_starts_octal : 1 = 0;
28
31 uint16_t escape_by_quote_doubling : 1 = 0;
32
37 uint16_t has_color_literal : 1 = 0;
38
39 uint16_t has_double_quote_string_literal : 1 = 0;
40 uint16_t has_single_quote_string_literal : 1 = 0;
41 uint16_t has_back_quote_string_literal : 1 = 0;
42
43 uint16_t has_double_slash_line_comment : 1 = 0;
44 uint16_t has_hash_line_comment : 1 = 0;
45 uint16_t has_semicolon_line_comment : 1 = 0;
46
47 uint16_t has_c_block_comment : 1 = 0;
48 uint16_t has_sgml_block_comment : 1 = 0;
49 uint16_t filter_white_space : 1 = 0;
50 uint16_t filter_comment : 1 = 0;
51
60 uint16_t equal_is_ini_assignment : 1 = 0;
61
71 uint16_t colon_is_ini_assignment : 1 = 0;
72
73 uint16_t minus_in_identifier : 1 = 0;
74
83 char digit_separator = '\0';
84
85 [[nodiscard]] constexpr static lexer_config c_style() noexcept
86 {
87 auto r = lexer_config{};
88 r.filter_white_space = 1;
89 r.zero_starts_octal = 1;
90 r.digit_separator = '\'';
91 r.has_double_quote_string_literal = 1;
92 r.has_single_quote_string_literal = 1;
93 r.has_double_slash_line_comment = 1;
94 r.has_c_block_comment = 1;
95 return r;
96 }
97
98 [[nodiscard]] constexpr static lexer_config css_style() noexcept
99 {
100 auto r = lexer_config{};
101 r.filter_white_space = 1;
102 r.filter_comment = 1;
103 r.has_color_literal = 1;
104 r.has_double_quote_string_literal = 1;
105 r.has_double_slash_line_comment = 1;
106 r.has_c_block_comment = 1;
107 r.minus_in_identifier = 1;
108 return r;
109 }
110
111 [[nodiscard]] constexpr static lexer_config ini_style() noexcept
112 {
113 auto r = lexer_config{};
114 r.filter_white_space = 1;
115 r.digit_separator = '_';
116 r.has_double_quote_string_literal = 1;
117 r.has_single_quote_string_literal = 1;
118 r.has_semicolon_line_comment = 1;
119 r.has_color_literal = 1;
120 r.equal_is_ini_assignment = 1;
121 return r;
122 }
123};
124
125namespace detail {
126
129template<lexer_config Config>
130class lexer {
131private:
// The states of the lexer's state machine.
//
// The underlying value of a state selects a row of 128 entries in the
// transition table (see get_command()), and `_size` determines the number of
// rows. Therefore `_size` must remain the last enumerator.
132 enum class state_type : uint8_t {
// Between tokens; every token starts in this state.
133 idle,
// Number literals; `zero` is entered after a single leading '0'.
134 zero,
135 bin_integer,
136 oct_integer,
137 dec_integer,
// An 'e' or 'E' was read after a decimal integer but is not captured yet; it
// either starts an exponent or, via the `_id` states, starts an identifier
// after the integer token has been emitted.
138 dec_integer_found_e,
139 dec_integer_found_E,
140 dec_integer_found_e_id,
141 dec_integer_found_E_id,
142 hex_integer,
143 dec_float,
// Same as the dec_integer_found_* states, but after a decimal float.
144 dec_float_found_e,
145 dec_float_found_E,
146 dec_float_found_e_id,
147 dec_float_found_E_id,
148 hex_float,
// Exponent parsing: `*_sign_exponent` accepts a sign or a digit, `*_exponent`
// requires a digit, `*_exponent_more` accepts further digits.
149 dec_sign_exponent,
150 hex_sign_exponent,
151 dec_exponent,
152 hex_exponent,
153 dec_exponent_more,
154 hex_exponent_more,
// Hexadecimal digits following a '#'.
155 color_literal,
// String literals for single, double and back quotes. `*_quote` is used when
// quotes are escaped by doubling, `*_escape` follows a back-slash.
156 sqstring_literal,
157 sqstring_literal_quote,
158 sqstring_literal_escape,
159 dqstring_literal,
160 dqstring_literal_quote,
161 dqstring_literal_escape,
162 bqstring_literal,
163 bqstring_literal_quote,
164 bqstring_literal_escape,
// Comments; the `found_*` states track a possible end-of-comment sequence.
165 line_comment,
166 block_comment,
167 block_comment_found_star,
168 block_comment_found_dash,
169 block_comment_found_dash_dash,
170 block_comment_found_dash_dash_fin0,
// A punctuation character was read that may start a longer token.
171 found_colon,
172 found_dot,
173 found_eq,
174 found_hash,
175 found_lt,
176 found_lt_bang,
177 found_lt_bang_dash,
// NOTE(review): no transition into or out of found_lt_eq is visible in this file.
178 found_lt_eq,
179 found_slash,
// The value of an INI-like assignment, up to the end of the line.
180 ini_string,
181 white_space,
182 identifier,
183
// Number of states; not a real state.
184 _size
185 };
186
// One entry of the transition table: what to do when a given character is
// seen in a given state. Executed by iterator::process_command().
189 struct command_type {
// The state to switch to after executing this command.
192 state_type next_state = state_type::idle;
193
// The token kind returned by process_command(); token::none means that no
// token is complete yet and lexing continues.
196 token::kind_type emit_token = token::none;
197
// The character appended to the token's capture; '\0' captures nothing.
200 char char_to_capture = '\0';
201
// Clear the token's capture before capturing.
204 uint8_t clear : 1 = 0;
205
// Consume the current character: update the line/column counters and read
// the next code-point.
208 uint8_t advance : 1 = 0;
209
// Set by the add() overloads once an entry is filled in explicitly; used to
// assert against overwriting an entry and to find unused idle-state entries.
212 uint8_t assigned : 1 = 0;
213
// When advancing: go to the next line and reset the column to zero.
// Set by _add() for '\n', '\v' and '\f'.
216 uint8_t advance_line : 1 = 0;
217
// When advancing: move the column to the next multiple of eight.
// Set by _add() for '\t'.
220 uint8_t advance_tab : 1 = 0;
221 };
222
// Empty tag types used to select behaviour in the add() and _add() overloads.
// clear_tag: the command clears the token's capture.
223 struct clear_tag {};
// any_tag: the command is added for all 128 characters of a state.
224 struct any_tag {};
// advance_tag: the command consumes the current character.
225 struct advance_tag {};
// capture_tag: the command captures the character it is registered for.
226 struct capture_tag {};
227
228 class excluding_tag {
229 public:
230 constexpr excluding_tag(std::string exclusions) noexcept : _exclusions(std::move(exclusions)) {}
231
232 [[nodiscard]] constexpr bool contains(char c) const noexcept
233 {
234 return _exclusions.find(c) != _exclusions.npos;
235 }
236
237 private:
238 std::string _exclusions;
239 };
240
// Tag values passed as trailing arguments to add().
// capture: append the registered character to the token's capture.
243 constexpr static auto capture = capture_tag{};
244
// advance: consume the current character and read the next code-point.
247 constexpr static auto advance = advance_tag{};
248
// clear: clear the token's capture.
251 constexpr static auto clear = clear_tag{};
252
// any: used in place of a character to register a command for all 128
// characters of a state.
255 constexpr static auto any = any_tag{};
256
257 template<size_t N>
258 [[nodiscard]] constexpr excluding_tag excluding(char const (&exclusions)[N]) noexcept
259 {
260 return excluding_tag{std::string(exclusions, N - 1)};
261 }
262
263 template<typename First, typename... Args>
264 [[nodiscard]] constexpr static bool _has_advance_tag_argument() noexcept
265 {
266 if constexpr (std::is_same_v<First, advance_tag>) {
267 return true;
268 } else if constexpr (sizeof...(Args) == 0) {
269 return false;
270 } else {
271 return _has_advance_tag_argument<Args...>();
272 }
273 }
274
275 template<typename... Args>
276 [[nodiscard]] constexpr static bool has_advance_tag_argument() noexcept
277 {
278 if constexpr (sizeof...(Args) == 0) {
279 return false;
280 } else {
281 return _has_advance_tag_argument<Args...>();
282 }
283 }
284
285public:
286 constexpr lexer() noexcept : _transition_table()
287 {
288 using enum state_type;
289
290 add(idle, '/', found_slash, advance, capture);
291 add(idle, '<', found_lt, advance, capture);
292 add(idle, '#', found_hash, advance, capture);
293 add(idle, '.', found_dot, advance, capture);
294 add(idle, '=', found_eq, advance, capture);
295 add(idle, ':', found_colon, advance, capture);
296
297 add(found_slash, any, idle, token::other);
298 add(found_lt, any, idle, token::other);
299 add(found_hash, any, idle, token::other);
300 add(found_dot, any, idle, token::other);
301 add(found_eq, any, idle, token::other);
302 add(found_colon, any, idle, token::other);
303
304 // Adds the starters "\"'`"
305 add_string_literals();
306
307 // Adds the starters "0123456789"
308 add_number_literals();
309
310 add_color_literal();
311 add_comments();
312 add_white_space();
313 add_identifier();
314 add_ini_assignment();
315
316 add(idle, "~!@$%^&*()-+[]{}\\|,>?", idle, token::other, capture, advance);
317
318 // All unused entries of the idle state are unexpected characters.
319 for (uint8_t i = 0; i != 128; ++i) {
320 auto& command = get_command(idle, char_cast<char>(i));
321 if (not command.assigned) {
322 command.assigned = 1;
323 command.advance = 1;
324 // If there are actual null characters in the string then nothing gets captured.
325 command.char_to_capture = char_cast<char>(i);
326 command.emit_token = token::error_unexepected_character;
327 command.next_state = idle;
328 }
329 }
330 }
331
332 [[nodiscard]] constexpr command_type& get_command(state_type from, char c) noexcept
333 {
334 return _transition_table[to_underlying(from) * 128_uz + char_cast<size_t>(c)];
335 }
336
337 [[nodiscard]] constexpr command_type const& get_command(state_type from, char c) const noexcept
338 {
339 return _transition_table[to_underlying(from) * 128_uz + char_cast<size_t>(c)];
340 }
341
342 template<typename It, std::sentinel_for<It> ItEnd>
343 struct iterator {
344 public:
346 using value_type = token;
347 using reference = value_type const &;
348 using pointer = value_type const *;
350
351 constexpr iterator(lexer const *lexer, It first, ItEnd last) noexcept :
352 _lexer(lexer), _first(first), _last(last), _it(first)
353 {
354 _cp = advance();
355 do {
356 _token.kind = parse_token();
357 } while (Config.filter_white_space and _token.kind == token::ws);
358 }
359
360 [[nodiscard]] constexpr reference operator*() const noexcept
361 {
362 return _token;
363 }
364
365 [[nodiscard]] constexpr pointer operator&() const noexcept
366 {
367 return std::addressof(_token);
368 }
369
370 constexpr iterator& operator++() noexcept
371 {
372 hi_axiom(*this != std::default_sentinel);
373 do {
374 _token.kind = parse_token();
375 } while (Config.filter_white_space and _token.kind == token::ws);
376 return *this;
377 }
378
379 constexpr void operator++(int) noexcept
380 {
381 ++(*this);
382 }
383
384 [[nodiscard]] constexpr bool operator==(std::default_sentinel_t) const noexcept
385 {
386 return _token.kind == token::none;
387 }
388
389 private:
390 lexer const *_lexer;
391 It _first;
392 ItEnd _last;
393 It _it;
394 char32_t _cp = 0;
395 token _token;
396 state_type _state = state_type::idle;
397 size_t _line_nr = 0;
398 size_t _column_nr = 0;
399
// Remove everything that was captured for the current token.
404 constexpr void clear() noexcept
405 {
406 _token.capture.clear();
407 }
408
// Append a single character to the capture of the current token.
// Called by process_command() with the ASCII character stored in the command.
413 constexpr void capture(char code_point) noexcept
414 {
415 _token.capture.push_back(code_point);
416 }
417
// Append a code-point to the capture of the current token, written through
// the "utf-8" character map.
// The end-of-text marker (0xffff'ffff, see advance()) must never be captured.
422 constexpr void capture(char32_t code_point) noexcept
423 {
424 hi_axiom(code_point < 0x7fff'ffff);
425
426 auto out_it = std::back_inserter(_token.capture);
427 char_map<"utf-8">{}.write(code_point, out_it);
428 }
429
430 constexpr void advance_counters() noexcept
431 {
432 if (_cp == '\n' or _cp == '\v' or _cp == '\f' or _cp == '\x85' or _cp == U'\u2028' or _cp == U'\u2029') {
433 ++_line_nr;
434 } else if (_cp == '\t') {
435 _column_nr /= 8;
436 ++_column_nr;
437 _column_nr *= 8;
438 } else {
439 ++_column_nr;
440 }
441 }
442
// Read the next code-point from the text.
//
// Returns the code-point, or 0xffff'ffff when the end of the text is reached.
// That marker is larger than the 0x7fff'ffff limit tested by parse_token(),
// which is how end-of-text terminates its main loop.
447 [[nodiscard]] constexpr char32_t advance() noexcept
448 {
449 if (_it == _last) {
450 return 0xffff'ffff;
451 }
452
// Presumably read() decodes one code-point and moves _it past it - TODO confirm.
// NOTE(review): `valid` is not used, so malformed UTF-8 is not reported here.
453 hilet[code_point, valid] = char_map<"utf-8">{}.read(_it, _last);
454 return code_point;
455 }
456
457 [[nodiscard]] constexpr token::kind_type parse_token_unicode_identifier() noexcept
458 {
459 switch (ucd_get_lexical_class(_cp & 0x1f'ffff)) {
460 case unicode_lexical_class::id_start:
461 case unicode_lexical_class::id_continue:
462 capture(_cp);
463 advance_counters();
464 _cp = advance();
465 return token::none;
466
467 default:
468 if (Config.minus_in_identifier and _cp == '-') {
469 capture(_cp);
470 advance_counters();
471 _cp = advance();
472 return token::none;
473
474 } else {
475 _state = state_type::idle;
476 return token::id;
477 }
478 }
479 }
480
481 [[nodiscard]] constexpr token::kind_type parse_token_unicode_line_comment() noexcept
482 {
483 hilet cp_ = _cp & 0x1f'ffff;
484 if (cp_ == U'\u0085' or cp_ == U'\u2028' or cp_ == U'\u2029') {
485 _state = state_type::idle;
486 advance_counters();
487 _cp = advance();
488 return token::lcomment;
489
490 } else {
491 capture(_cp);
492 advance_counters();
493 _cp = advance();
494 return token::none;
495 }
496 }
497
498 [[nodiscard]] constexpr token::kind_type parse_token_unicode_white_space() noexcept
499 {
500 if (ucd_get_lexical_class(_cp & 0x1f'ffff) == unicode_lexical_class::white_space) {
501 capture(_cp);
502 advance_counters();
503 _cp = advance();
504 return token::none;
505
506 } else {
507 _state = state_type::idle;
508 return token::ws;
509 }
510 }
511
512 [[nodiscard]] constexpr token::kind_type parse_token_unicode_idle() noexcept
513 {
514 switch (ucd_get_lexical_class(_cp & 0x1f'ffff)) {
515 case unicode_lexical_class::id_start:
516 _state = state_type::identifier;
517 capture(_cp);
518 advance_counters();
519 _cp = advance();
520 return token::none;
521
522 case unicode_lexical_class::white_space:
523 _state = state_type::white_space;
524 capture(_cp);
525 advance_counters();
526 _cp = advance();
527 return token::none;
528
529 case unicode_lexical_class::syntax:
530 _state = state_type::idle;
531 capture(_cp);
532 advance_counters();
533 _cp = advance();
534 return token::other;
535
536 default:
537 capture(_cp);
538 advance_counters();
539 _cp = advance();
540 return token::error_unexepected_character;
541 }
542 }
543
544 [[nodiscard]] hi_no_inline constexpr token::kind_type parse_token_unicode() noexcept
545 {
546 using enum state_type;
547
548 // Unicode by-pass.
549 switch (_state) {
550 case idle:
551 return parse_token_unicode_idle();
552
553 case white_space:
554 return parse_token_unicode_white_space();
555
556 case line_comment:
557 return parse_token_unicode_line_comment();
558
559 case identifier:
560 return parse_token_unicode_identifier();
561
562 case dqstring_literal:
563 case sqstring_literal:
564 case bqstring_literal:
565 case block_comment:
566 capture(_cp);
567 advance_counters();
568 _cp = advance();
569 return token::none;
570
571 case ini_string:
572 // Line-feeds will terminate an ini-string.
573 if (_cp == U'\u0085' or _cp == U'\u2028' or _cp == U'\u2029') {
574 return token::istr;
575 } else {
576 capture(_cp);
577 advance_counters();
578 _cp = advance();
579 return token::none;
580 }
581
582 default:
583 // Most tokens are terminated when a non-ascii code-point is found.
584 // Terminate these tokens as if we reached end-of-file.
585 return process_command();
586 }
587 }
588
589 [[nodiscard]] constexpr token::kind_type process_command(char c = '\0') noexcept
590 {
591 hilet command = _lexer->get_command(_state, c);
592 _state = command.next_state;
593
594 if (command.clear) {
595 clear();
596 }
597
598 if (command.char_to_capture != '\0') {
599 capture(command.char_to_capture);
600 }
601
602 if (command.advance) {
603 if (command.advance_line) {
604 ++_line_nr;
605 _column_nr = 0;
606 } else if (command.advance_tab) {
607 _column_nr /= 8;
608 ++_column_nr;
609 _column_nr *= 8;
610 } else {
611 ++_column_nr;
612 }
613 _cp = advance();
614 }
615
616 return command.emit_token;
617 }
618
619 [[nodiscard]] constexpr token::kind_type parse_token() noexcept
620 {
621 _token.line_nr = _line_nr;
622 _token.column_nr = _column_nr;
623 clear();
624
625 while (_cp <= 0x7fff'ffff) {
626 if (_cp <= 0x7f) {
627 if (auto token_kind = process_command(char_cast<char>(_cp)); token_kind != token::none) {
628 return token_kind;
629 }
630
631 } else {
632 auto emit_token = parse_token_unicode();
633 if (emit_token != token::none) {
634 return emit_token;
635 }
636 }
637 }
638
639 // Handle trailing state changes at end-of-file.
640 while (_state != state_type::idle) {
641 if (auto token_kind = process_command(); token_kind != token::none) {
642 return token_kind;
643 }
644 }
645
646 // We have finished parsing and there was no token captured.
647 // For example when the end of file only contains white-space.
648 return token::none;
649 }
650 };
651
652 static_assert(std::movable<iterator<std::string::iterator, std::string::iterator>>);
653 static_assert(std::is_same_v<std::iterator_traits<iterator<std::string::iterator, std::string::iterator>>::value_type, token>);
654 static_assert(std::input_or_output_iterator<iterator<std::string::iterator, std::string::iterator>>);
655 static_assert(std::weakly_incrementable<iterator<std::string::iterator, std::string::iterator>>);
656
663 template<typename It, std::sentinel_for<It> ItEnd>
664 [[nodiscard]] constexpr iterator<It, ItEnd> parse(It first, ItEnd last) const noexcept
665 {
666 return iterator<It, ItEnd>{this, first, last};
667 }
668
674 [[nodiscard]] constexpr auto parse(std::string_view str) const noexcept
675 {
676 return parse(str.begin(), str.end());
677 }
678
679private:
683 using transition_table_type = std::array<command_type, to_underlying(state_type::_size) * 128>;
684
685 transition_table_type _transition_table;
686
687 constexpr void add_string_literal(
688 char c,
689 token::kind_type string_token,
690 state_type string_literal,
691 state_type string_literal_quote,
692 state_type string_literal_escape) noexcept
693 {
694 using enum state_type;
695
696 add(idle, c, string_literal, advance);
697 add(string_literal, any, idle, token::error_incomplete_string);
698 for (uint8_t i = 1; i != 128; ++i) {
699 if (char_cast<char>(i) != c and char_cast<char>(i) != '\\') {
700 add(string_literal, char_cast<char>(i), string_literal, advance, capture);
701 }
702 }
703
704 if constexpr (Config.escape_by_quote_doubling) {
705 // Don't capture the first quote.
706 add(string_literal, c, string_literal_quote, advance);
707 // If quote is not doubled, this is the end of the string.
708 add(string_literal_quote, any, idle, string_token);
709 // Capture one quote of a doubled quote.
710 add(string_literal_quote, c, string_literal, advance, capture);
711 } else {
712 // Quote ends the string.
713 add(string_literal, c, idle, advance, string_token);
714 }
715
716 // Make sure that any escaped character sequence stays inside the string literal.
717 add(string_literal, '\\', string_literal_escape, advance, capture);
718 add(string_literal_escape, any, idle, token::error_incomplete_string);
719 for (uint8_t i = 1; i != 128; ++i) {
720 add(string_literal_escape, char_cast<char>(i), string_literal, advance, capture);
721 }
722 }
723
724 constexpr void add_string_literals() noexcept
725 {
726 using enum state_type;
727
728 if constexpr (Config.has_single_quote_string_literal) {
729 add_string_literal('\'', token::sstr, sqstring_literal, sqstring_literal_quote, sqstring_literal_escape);
730 } else {
731 add(idle, '\'', idle, token::other, advance, capture);
732 }
733
734 if constexpr (Config.has_double_quote_string_literal) {
735 add_string_literal('"', token::dstr, dqstring_literal, dqstring_literal_quote, dqstring_literal_escape);
736 } else {
737 add(idle, '"', idle, token::other, advance, capture);
738 }
739
740 if constexpr (Config.has_back_quote_string_literal) {
741 add_string_literal('`', token::bstr, bqstring_literal, bqstring_literal_quote, bqstring_literal_escape);
742 } else {
743 add(idle, '`', idle, token::other, advance, capture);
744 }
745 }
746
747 constexpr void add_number_literals() noexcept
748 {
749 using enum state_type;
750
751 add(idle, "0", zero, advance, capture);
752 add(idle, "123456789", dec_integer, advance, capture);
753
754 add(zero, any, idle, token::integer);
755 add(zero, ".", dec_float, advance, capture);
756 add(zero, "bB", bin_integer, advance, capture);
757 add(zero, "oO", oct_integer, advance, capture);
758 add(zero, "dD", dec_integer, advance, capture);
759 add(zero, "xX", hex_integer, advance, capture);
760
761 if constexpr (Config.zero_starts_octal) {
762 add(zero, "01234567", oct_integer, advance, capture);
763 add(zero, "89", idle, token::error_invalid_digit);
764 } else {
765 add(zero, "0123456789", dec_integer, advance, capture);
766 }
767
768 // binary-integer
769 add(bin_integer, any, idle, token::integer);
770 add(bin_integer, "01", bin_integer, advance, capture);
771 add(bin_integer, "23456789", idle, token::error_invalid_digit);
772
773 // octal-integer
774 add(oct_integer, any, idle, token::integer);
775 add(oct_integer, "01234567", oct_integer, advance, capture);
776 add(oct_integer, "89", idle, token::error_invalid_digit);
777
778 // decimal-integer
779 add(dec_integer, any, idle, token::integer);
780 add(dec_integer, "0123456789", dec_integer, advance, capture);
781 add(dec_integer, ".", dec_float, advance, capture);
782 add(dec_integer, "e", dec_integer_found_e, advance);
783 add(dec_integer, "E", dec_integer_found_E, advance);
784 add(dec_integer_found_e, any, dec_integer_found_e_id, token::integer);
785 add(dec_integer_found_E, any, dec_integer_found_E_id, token::integer);
786 add(dec_integer_found_e_id, any, identifier, 'e');
787 add(dec_integer_found_E_id, any, identifier, 'E');
788 add(dec_integer_found_e, "+-0123456789", dec_sign_exponent, 'e');
789 add(dec_integer_found_E, "+-0123456789", dec_sign_exponent, 'E');
790
791 // hexadecimal-integer
792 add(hex_integer, any, idle, token::integer);
793 add(hex_integer, "0123456789abcdefABCDEF", hex_integer, advance, capture);
794 add(hex_integer, ".", hex_float, advance, capture);
795 add(hex_integer, "pP", hex_sign_exponent, advance, capture);
796
797 // decimal-float
798 add(found_dot, "0123456789eE", dec_float);
799 add(dec_float, any, idle, token::real);
800 add(dec_float, "0123456789", dec_float, advance, capture);
801 add(dec_float, "e", dec_float_found_e, advance);
802 add(dec_float, "E", dec_float_found_E, advance);
803 add(dec_float_found_e, any, dec_float_found_e_id, token::real);
804 add(dec_float_found_E, any, dec_float_found_E_id, token::real);
805 add(dec_float_found_e_id, any, identifier, 'e');
806 add(dec_float_found_E_id, any, identifier, 'E');
807 add(dec_float_found_e, "+-0123456789", dec_sign_exponent, 'e');
808 add(dec_float_found_E, "+-0123456789", dec_sign_exponent, 'E');
809
810 add(dec_sign_exponent, any, idle, token::error_incomplete_exponent);
811 add(dec_sign_exponent, "0123456789", dec_exponent_more, advance, capture);
812 add(dec_sign_exponent, "+-", dec_exponent, advance, capture);
813 add(dec_exponent, any, idle, token::error_incomplete_exponent);
814 add(dec_exponent, "0123456789", dec_exponent_more, advance, capture);
815 add(dec_exponent_more, any, idle, token::real);
816 add(dec_exponent_more, "0123456789", dec_exponent_more, advance, capture);
817
818 // hexadecimal-float
819 add(hex_float, any, idle, token::real);
820 add(hex_float, "0123456789abcdefABCDEF", hex_float, advance, capture);
821 add(hex_float, "pP", hex_sign_exponent, advance, capture);
822 add(hex_sign_exponent, any, idle, token::error_incomplete_exponent);
823 add(hex_sign_exponent, "0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
824 add(hex_sign_exponent, "+-", hex_exponent, advance, capture);
825 add(hex_exponent, any, idle, token::error_incomplete_exponent);
826 add(hex_exponent, "0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
827 add(hex_exponent_more, any, idle, token::real);
828 add(hex_exponent_more, "0123456789abcdefABCDEF", hex_exponent_more, advance, capture);
829
830 if constexpr (Config.digit_separator != '\0') {
831 if constexpr (Config.zero_starts_octal) {
832 add(zero, Config.digit_separator, oct_integer, advance);
833 } else {
834 add(zero, Config.digit_separator, dec_integer, advance);
835 }
836 add(bin_integer, Config.digit_separator, bin_integer, advance);
837 add(oct_integer, Config.digit_separator, oct_integer, advance);
838 add(dec_integer, Config.digit_separator, dec_integer, advance);
839 add(hex_integer, Config.digit_separator, hex_integer, advance);
840 add(dec_float, Config.digit_separator, dec_integer, advance);
841 add(hex_float, Config.digit_separator, dec_integer, advance);
842 add(dec_exponent, Config.digit_separator, dec_integer, advance);
843 add(hex_exponent, Config.digit_separator, dec_integer, advance);
844 }
845 }
846
847 constexpr void add_color_literal() noexcept
848 {
849 using enum state_type;
850
851 if constexpr (Config.has_color_literal) {
852 add(found_hash, "0123456789abcdefABCDEF", color_literal, clear, capture, advance);
853 add(color_literal, any, idle, token::color);
854 add(color_literal, "0123456789abcdefABCDEF", color_literal, advance, capture);
855 }
856 }
857
858 constexpr void add_ini_assignment() noexcept
859 {
860 using enum state_type;
861
862 if constexpr (Config.equal_is_ini_assignment) {
863 // Ignore white-space
864 add(found_eq, " \t", found_eq, advance);
865 add(found_eq, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", ini_string, token::other);
866 }
867
868 if constexpr (Config.colon_is_ini_assignment) {
869 // Ignore white-space
870 add(found_colon, " \t", found_colon, advance);
871 add(found_colon, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", ini_string, token::other);
872 }
873
874 add(ini_string, any, idle, token::istr);
875 add(ini_string, excluding("\n\v\f\r\0"), ini_string, advance, capture);
876 add(ini_string, '\r', ini_string, advance);
877 }
878
879 constexpr void add_comments() noexcept
880 {
881 using enum state_type;
882
883 if constexpr (Config.has_double_slash_line_comment) {
884 add(found_slash, '/', line_comment, clear, advance);
885 }
886
887 if constexpr (Config.has_semicolon_line_comment) {
888 add(idle, ';', line_comment, advance);
889 } else {
890 add(idle, ';', idle, token::other, capture, advance);
891 }
892
893 if constexpr (Config.has_hash_line_comment) {
894 add(found_hash, excluding("\0"), line_comment, clear, advance, capture);
895 }
896
897 if constexpr (Config.has_c_block_comment) {
898 add(found_slash, '*', block_comment, advance, clear);
899 }
900
901 if constexpr (Config.has_sgml_block_comment) {
902 add(found_lt, '!', found_lt_bang, advance);
903 add(found_lt_bang, any, idle, token::error_after_lt_bang);
904 add(found_lt_bang, '-', found_lt_bang_dash, advance);
905 add(found_lt_bang_dash, any, idle, token::error_after_lt_bang);
906 add(found_lt_bang_dash, '-', block_comment, advance);
907 }
908
909 add(line_comment, any, idle, token::lcomment);
910 add(line_comment, excluding("\r\n\f\v\0"), line_comment, advance, capture);
911
912 add(line_comment, '\r', line_comment, advance);
913 add(line_comment, "\n\f\v", idle, advance, token::lcomment);
914
915 add(block_comment, any, idle, token::error_incomplete_comment);
916
917 static_assert(Config.has_c_block_comment == 0 or Config.has_sgml_block_comment == 0);
918
919 if constexpr (Config.has_c_block_comment) {
920 add(block_comment, excluding("*\0"), block_comment, advance, capture);
921 add(block_comment, '*', block_comment_found_star, advance);
922 add(block_comment_found_star, any, block_comment, '*');
923 add(block_comment_found_star, '/', idle, advance, token::bcomment);
924
925 } else if constexpr (Config.has_sgml_block_comment) {
926 add(block_comment, excluding("-\0"), block_comment, advance, capture);
927 add(block_comment, '-', block_comment_found_dash, advance);
928 add(block_comment_found_dash, any, block_comment, '-');
929 add(block_comment_found_dash, '-', block_comment_found_dash_dash, advance);
930 add(block_comment_found_dash_dash, any, block_comment_found_dash_dash_fin0, '-');
931 add(block_comment_found_dash_dash_fin0, any, block_comment, '-');
932 add(block_comment_found_dash_dash, '>', idle, advance, token::bcomment);
933 }
934 }
935
936 constexpr void add_white_space() noexcept
937 {
938 using enum state_type;
939
940 add(idle, '\r', white_space, advance);
941 add(idle, " \n\t\v\f", white_space, advance, capture);
942 add(white_space, any, idle, token::ws);
943 add(white_space, '\r', white_space, advance);
944 add(white_space, " \n\t\v\f", white_space, advance, capture);
945 }
946
947 constexpr void add_identifier() noexcept
948 {
949 using enum state_type;
950
951 add(idle, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", identifier, advance, capture);
952 add(identifier, any, idle, token::id);
953 add(identifier, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789", identifier, advance, capture);
954 if constexpr (Config.minus_in_identifier) {
955 add(identifier, '-', identifier, advance, capture);
956 }
957 }
958
959 constexpr command_type& _add(state_type from, char c, state_type to) noexcept
960 {
961 auto& command = get_command(from, c);
962 command.next_state = to;
963 command.char_to_capture = '\0';
964 command.advance = 0;
965 command.advance_line = 0;
966 command.advance_tab = 0;
967 command.clear = 0;
968 command.emit_token = token::none;
969 return command;
970 }
971
986 template<typename First, typename... Args>
987 constexpr command_type& _add(state_type from, char c, state_type to, First first, Args const&...args) noexcept
988 {
989 auto& command = _add(from, c, to, args...);
990 if constexpr (std::is_same_v<First, token::kind_type>) {
991 command.emit_token = first;
992
993 } else if constexpr (std::is_same_v<First, advance_tag>) {
994 command.advance = 1;
995 if (c == '\n' or c == '\v' or c == '\f') {
996 command.advance_line = 1;
997 } else if (c == '\t') {
998 command.advance_tab = 1;
999 }
1000
1001 } else if constexpr (std::is_same_v<First, clear_tag>) {
1002 command.clear = 1;
1003
1004 } else if constexpr (std::is_same_v<First, capture_tag>) {
1005 command.char_to_capture = c;
1006
1007 } else if constexpr (std::is_same_v<First, char>) {
1008 command.char_to_capture = first;
1009
1010 } else {
1012 }
1013
1014 return command;
1015 }
1016
1017 template<typename... Args>
1018 constexpr void add(state_type from, char c, state_type to, Args const&...args) noexcept
1019 {
1020 auto& command = _add(from, c, to, args...);
1021 hi_assert(not command.assigned, "Overwriting an already assigned state:char combination.");
1022 command.assigned = true;
1023 }
1024
1025 template<typename... Args>
1026 constexpr void add(state_type from, std::string_view str, state_type to, Args const&...args) noexcept
1027 {
1028 for (auto c : str) {
1029 auto& command = _add(from, c, to, args...);
1030 hi_assert(not command.assigned, "Overwriting an already assigned state:char combination.");
1031 command.assigned = true;
1032 }
1033 }
1034
1035 template<typename... Args>
1036 constexpr void add(state_type from, any_tag, state_type to, Args const&...args) noexcept
1037 {
1038 static_assert(not has_advance_tag_argument<Args...>(), "any should not advance");
1039
1040 for (uint8_t c = 0; c != 128; ++c) {
1041 hilet& command = _add(from, char_cast<char>(c), to, args...);
1042 hi_assert(not command.assigned, "any should be added first to a state");
1043 }
1044 }
1045
1046 template<typename... Args>
1047 constexpr void add(state_type from, excluding_tag const& exclusions, state_type to, Args const&...args) noexcept
1048 {
1049 for (uint8_t c = 0; c != 128; ++c) {
1050 if (not exclusions.contains(char_cast<char>(c))) {
1051 auto& command = _add(from, char_cast<char>(c), to, args...);
1052 hi_assert(not command.assigned, "Overwriting an already assigned state:char combination.");
1053 command.assigned = true;
1054 }
1055 }
1056 }
1057};
1058
1059} // namespace detail
1060
1061template<lexer_config Config>
1062constexpr auto lexer = detail::lexer<Config>();
1063
1064}} // namespace hi::v1
#define hi_static_no_default(...)
This part of the code should not be reachable, unless a programming bug.
Definition assert.hpp:323
#define hi_assert(expression,...)
Assert that the expression is true.
Definition assert.hpp:199
#define hi_axiom(expression,...)
Specify an axiom; an expression that is true.
Definition assert.hpp:253
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
DOXYGEN BUG.
Definition algorithm.hpp:13
geometry/margins.hpp
Definition cache.hpp:11
@ zero
The number was zero, and this means something in the current language.
Character encoder/decoder template.
Definition char_converter.hpp:83
Definition lexer.hpp:21
uint16_t escape_by_quote_doubling
Escaping quotes within a string may be done using quote doubling.
Definition lexer.hpp:31
uint16_t equal_is_ini_assignment
The equal '=' character is used for INI-like assignment.
Definition lexer.hpp:60
char digit_separator
The character used to separate groups of numbers.
Definition lexer.hpp:83
uint16_t colon_is_ini_assignment
The colon ':' character is used for INI-like assignment.
Definition lexer.hpp:71
uint16_t has_color_literal
The language has a literal color.
Definition lexer.hpp:37
uint16_t zero_starts_octal
A leading zero starts an octal number.
Definition lexer.hpp:27
A configurable lexical analyzer with unicode Annex #31 support.
Definition lexer.hpp:130
constexpr auto parse(std::string_view str) const noexcept
Parse a string of UTF-8 characters.
Definition lexer.hpp:674
constexpr iterator< It, ItEnd > parse(It first, ItEnd last) const noexcept
Parse a range of UTF-8 characters.
Definition lexer.hpp:664
Definition lexer.hpp:343
Definition token.hpp:15
T addressof(T... args)
T back_inserter(T... args)
T clear(T... args)
T find(T... args)
T move(T... args)
T push_back(T... args)