This repository has been archived by the owner on Dec 1, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 8
/
ctokenizer.cpp
596 lines (525 loc) · 20.8 KB
/
ctokenizer.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
// Copyright (C) 2018-2019 Jonathan Müller <jonathanmueller.dev@gmail.com>
// This file is subject to the license terms in the LICENSE file
// found in the top-level directory of this distribution.
// This example implements an approximation of a tokenizer for C.
#include <foonathan/lex/ascii.hpp> // utilities for ASCII matching
#include <foonathan/lex/tokenizer.hpp> // the main header for tokenization
// A namespace for the token grammar.
namespace C
{
namespace lex = foonathan::lex;
// Every token is a class inheriting from a special base that will specify the kind of token it is.
// The specification is then just an alias or inheritance of `lex::token_spec` passing it all the
// tokens. As some token types need to refer to the token specification, we have to alias it first,
// and thus need to pass forward declarations.
//
// Note that the order of tokens in the specification doesn't matter.
struct spec
: lex::token_spec<
struct whitespace, struct comment, struct identifier, struct int_literal,
struct float_literal, struct char_literal, struct string_literal, struct auto_, struct break_,
struct case_, struct char_, struct const_, struct continue_, struct default_, struct do_,
struct double_, struct else_, struct enum_, struct extern_, struct float_, struct for_,
struct goto_, struct if_, struct int_, struct long_, struct register_, struct return_,
struct short_, struct signed_, struct sizeof_, struct switch_, struct typedef_, struct union_,
struct void_, struct volatile_, struct while_, struct open_paren, struct close_paren,
struct open_curly, struct close_curly, struct open_square, struct close_square, struct add,
struct sub, struct mul, struct div, struct mod, struct and_, struct xor_, struct or_,
struct shift_right, struct shift_left, struct inc, struct dec, struct assign,
struct add_assign, struct sub_assign, struct mul_assign, struct div_assign, struct mod_assign,
struct and_assign, struct xor_assign, struct or_assign, struct shift_right_assign,
struct shift_left_assign, struct equal, struct not_equal, struct less, struct greater,
struct less_equal, struct greater_equal, struct logical_and, struct logical_or,
struct semicolon, struct comma, struct colon, struct dot, struct ellipsis, struct arrow,
struct tilde, struct exclamation_mark, struct question_mark>
{};
//=== comments and whitespace ===//
// `lex::rule_token` is the base class for tokens that follow a more complex grammar rule.
// It is a CRTP base class and will invoke a `rule()` function of the derived class to determine
// whether a given input matches. This function returns a combination of `lex::token_rule` that are
// PEG (parsing expression grammar) rules.
//
// We also inherit from `lex::whitespace_token`.
// This means that the token will be skipped when iterating over all tokens later on.
//
// Note that rule tokens will be tried in arbitrary order, so they have to be mutually exclusive.
struct whitespace : lex::rule_token<whitespace, spec>, lex::whitespace_token
{
static constexpr auto rule() noexcept
{
// Whitespace is just an arbitrary combination of ASCII whitespace characters.
// Note that the token is only considered matched if there is at least one character
// consumed, so in a top-level context `star()` and `plus()` are equivalent.
return lex::token_rule::star(lex::ascii::is_space);
}
// We can also give the token a name, this is only required if you call `.name()` later on.
static constexpr const char* name = "<whitespace>";
};
// Likewise, we can define a comment.
// It is also considered whitespace so will be skipped.
struct comment : lex::rule_token<comment, spec>, lex::whitespace_token
{
// As comments start with `/` they conflict with the `/` literal.
// So we have to tell the tokenizer that it has to check for a comment, after matching `/`.
static constexpr bool is_conflicting_literal(token_kind kind) noexcept
{
return kind == token_kind::of<div>();
}
static constexpr auto rule() noexcept
{
namespace tr = lex::token_rule;
// A C comment consists of `/*` followed by anything until `*/`.
auto c_comment = "/*" + tr::until("*/");
// A C++ comment consist of `//` followed by anything until a newline.
// The newline is not considered part of this token.
auto cpp_comment = "//" + tr::until_excluding(lex::ascii::is_newline);
// A comment token is either a C or a C++ token.
return c_comment / cpp_comment;
}
static constexpr const char* name = "<comment>";
};
//=== identifier ===//
// Identifiers are a special kind of `lex::rule_token`.
// They require some special interaction with keywords, so they inherit from `lex::identifier_token`
// instead. A token specification must contain at most one identifier token. Otherwise, they behave
// like `lex::rule_token`.
struct identifier : lex::identifier_token<identifier, spec>
{
static constexpr auto rule() noexcept
{
namespace tr = lex::token_rule;
// An identifier is a start character followed by zero-or-more rest characters.
return is_identifier_start + tr::star(is_identifier_rest);
}
static constexpr bool is_identifier_start(char c) noexcept
{
return lex::ascii::is_alpha(c) || c == '_';
}
static constexpr bool is_identifier_rest(char c) noexcept
{
return lex::ascii::is_alnum(c) || c == '_';
}
// Note that we do not need to give an identifier token a name.
};
//=== literals ===/
// As int and float literals are tightly coupled, it would be nice if they could be parsed by one
// function. Here I choose to implement it in `float_literal`, so `int_literal` doesn't need to have
// any code for matching. As such it inherits from `lex::null_token`. Those tokens will not be
// matched alone, but only by other rules.
struct int_literal : lex::null_token
{
static constexpr const char* name = "<int_literal>";
};
// Because `float_literal` matches both int and float literals,
// we have to write the matching code ourselves and cannot just provide a `rule()` function.
// So we inherit from `lex::basic_rule_token` instead, which is a similar CRTP class but it doesn't
// provide the matching implementation for us.
struct float_literal : lex::basic_rule_token<float_literal, spec>
{
// As floating point numbers can start with `.`, they conflict with the `.` token.
static constexpr bool is_conflicting_literal(token_kind kind) noexcept
{
return kind == token_kind::of<dot>();
}
// The rule for an integer suffix (i.e. the `u` in `0u`).
static constexpr auto integer_suffix() noexcept
{
namespace tr = lex::token_rule;
// Note the use of `tr::r`, `'u' / 'U'` wouldn't do the right thing.
// so we have to manually turn one of the characters into a rule with that function.
auto sign_suffix = tr::r('u') / 'U';
auto long_suffix = tr::r('l') / 'L';
auto sign_first_suffix = sign_suffix + tr::opt(long_suffix);
auto long_first_suffix = long_suffix + tr::opt(sign_suffix);
auto suffix = sign_first_suffix / long_first_suffix;
return tr::opt(suffix);
}
// The rule for a float suffix (i.e. the `f` in `0.f`).
static constexpr auto float_suffix() noexcept
{
namespace tr = lex::token_rule;
auto suffix = tr::r('f') / 'F' / 'l' / 'L';
return tr::opt(suffix);
}
// The rule for a float exponent (i.e. `2e-4`).
static constexpr auto float_exponent() noexcept
{
namespace tr = lex::token_rule;
return (tr::r('e') / 'E') + tr::opt(tr::r('+') / '-') + tr::plus(is_decimal_digit);
}
// This is the function that must determine the match.
// `str` points to the current position in the input, `end` one past the end.
// It will only be called if there is at least one character.
// It returns a `match_result` that describe which token was matched and how long it is.
static constexpr match_result try_match(const char* str, const char* end) noexcept
{
namespace tr = lex::token_rule;
// The rule for integer literals.
// Note that '0' is an octal literal.
auto hexadecimal = '0' + (tr::r('x') / 'X') + tr::plus(is_hexadecimal_digit);
auto octal = '0' + tr::star(is_octal_digit);
// The `!tr::r('0')` is a negative lookahead - only if the next character is not `0`,
// does the rule match.
auto decimal = !tr::r('0') + tr::plus(is_decimal_digit);
auto int_rule = (hexadecimal / octal / decimal) + integer_suffix();
// The rule for float literals.
auto float_with_fraction
= tr::padded(is_decimal_digit, '.', is_decimal_digit) + tr::opt(float_exponent());
auto float_without_fraction = tr::plus(is_decimal_digit) + float_exponent();
auto float_rule = (float_with_fraction / float_without_fraction) + float_suffix();
// In order to match the rules, we create a `lex::rule_matcher` giving it the input.
rule_matcher matcher(str, end);
if (matcher.match(float_rule))
// It matched the floating point rule.
// We finish parsing the float literal token if the remaining characters are not alpha
// numeric. If the remaining characters are alpha numeric, an error token will be
// matched instead. Note that we use a negative lookahead, so it will only look at the
// next characters and never put them into the token.
return matcher.finish(float_literal{}, !tr::r(lex::ascii::is_alnum) + !tr::r('_'));
else if (matcher.match(int_rule))
// It matched the integer rule token.
// As above, we finish matching by checking
return matcher.finish(int_literal{}, !tr::r(lex::ascii::is_alnum) + !tr::r('_'));
else
// It was neither an integer nor a float literal.
return unmatched();
}
static constexpr bool is_decimal_digit(char c) noexcept
{
return lex::ascii::is_digit(c);
}
static constexpr bool is_octal_digit(char c) noexcept
{
return c >= '0' && c <= '7';
}
static constexpr bool is_hexadecimal_digit(char c) noexcept
{
return lex::ascii::is_digit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}
static constexpr const char* name = "<float_literal>";
};
// A character literal is a boring rule token again.
struct char_literal : lex::rule_token<char_literal, spec>
{
static constexpr auto rule() noexcept
{
namespace tr = lex::token_rule;
auto octal_escape = "\\0" + tr::at_most<2>(float_literal::is_octal_digit);
auto hexadecimal_escape = "\\x" + tr::at_most<2>(float_literal::is_hexadecimal_digit);
auto other_escape = '\\' + tr::any; // for simplicity, allow any character
auto c_char = octal_escape / hexadecimal_escape / other_escape / tr::any;
return tr::opt('L') + '\'' + c_char + '\'';
}
static constexpr const char* name = "<char_literal>";
};
// And so is a string literal.
struct string_literal : lex::rule_token<string_literal, spec>
{
static constexpr auto rule() noexcept
{
namespace tr = lex::token_rule;
// If we have a \, we skip the character afterwards as well.
// Otherwise, we only skip one character.
// That way we will never match an escaped " and can just use `tr::until()` below.
auto s_char = tr::if_then_else('\\', tr::any, tr::any);
return tr::opt('L') + '"' + tr::until('"', s_char);
}
static constexpr const char* name = "<string_literal>";
};
//=== keywords ===//
// All keywords simply inherit from `FOONATHAN_LEX_KEYWORD(Str)`.
// This macro expands to `lex::keyword_token<Str[0], Str[1], ...>`.
// The tokenizer will automatically check whether an `lex::identifier_token` is actually a keyword,
// so if you have a single keyword token you also need to have an identifier token.
// Note that we don't need to name the tokens, they will be named automatically.
struct auto_ : FOONATHAN_LEX_KEYWORD("auto")
{};
struct break_ : FOONATHAN_LEX_KEYWORD("break")
{};
struct case_ : FOONATHAN_LEX_KEYWORD("case")
{};
struct char_ : FOONATHAN_LEX_KEYWORD("char")
{};
struct const_ : FOONATHAN_LEX_KEYWORD("const")
{};
struct continue_ : FOONATHAN_LEX_KEYWORD("continue")
{};
struct default_ : FOONATHAN_LEX_KEYWORD("default")
{};
struct do_ : FOONATHAN_LEX_KEYWORD("do")
{};
struct double_ : FOONATHAN_LEX_KEYWORD("double")
{};
struct else_ : FOONATHAN_LEX_KEYWORD("else")
{};
struct enum_ : FOONATHAN_LEX_KEYWORD("enum")
{};
struct extern_ : FOONATHAN_LEX_KEYWORD("extern")
{};
struct float_ : FOONATHAN_LEX_KEYWORD("float")
{};
struct for_ : FOONATHAN_LEX_KEYWORD("for")
{};
struct goto_ : FOONATHAN_LEX_KEYWORD("goto")
{};
struct if_ : FOONATHAN_LEX_KEYWORD("if")
{};
struct int_ : FOONATHAN_LEX_KEYWORD("int")
{};
struct long_ : FOONATHAN_LEX_KEYWORD("long")
{};
struct register_ : FOONATHAN_LEX_KEYWORD("register")
{};
struct return_ : FOONATHAN_LEX_KEYWORD("return")
{};
struct short_ : FOONATHAN_LEX_KEYWORD("short")
{};
struct signed_ : FOONATHAN_LEX_KEYWORD("signed")
{};
struct sizeof_ : FOONATHAN_LEX_KEYWORD("sizeof")
{};
struct static_ : FOONATHAN_LEX_KEYWORD("static")
{};
struct struct_ : FOONATHAN_LEX_KEYWORD("struct")
{};
struct switch_ : FOONATHAN_LEX_KEYWORD("switch")
{};
struct typedef_ : FOONATHAN_LEX_KEYWORD("typedef")
{};
struct union_ : FOONATHAN_LEX_KEYWORD("union")
{};
struct unsigned_ : FOONATHAN_LEX_KEYWORD("unsigned")
{};
struct void_ : FOONATHAN_LEX_KEYWORD("void")
{};
struct volatile_ : FOONATHAN_LEX_KEYWORD("volatile")
{};
struct while_ : FOONATHAN_LEX_KEYWORD("while")
{};
//=== punctuation tokens ===//
// Punctuation tokens are similar to keyword tokens but do not need the special interaction with
// identifier tokens. So they inherit from `FOONATHAN_LEX_LITERAL(Str)` or
// `lex::literal_token<Str[0], Str[1], ...>`. The tokenizer will automatically match them after it
// tried all rule tokens unsuccessfully.
//
// Note that the order doesn't matter, it will match the longest token possible.
struct open_paren : FOONATHAN_LEX_LITERAL("(")
{};
struct close_paren : FOONATHAN_LEX_LITERAL(")")
{};
struct open_curly : FOONATHAN_LEX_LITERAL("{")
{};
struct close_curly : FOONATHAN_LEX_LITERAL("}")
{};
struct open_square : FOONATHAN_LEX_LITERAL("[")
{};
struct close_square : FOONATHAN_LEX_LITERAL("]")
{};
struct add : FOONATHAN_LEX_LITERAL("+")
{};
struct sub : FOONATHAN_LEX_LITERAL("-")
{};
struct mul : FOONATHAN_LEX_LITERAL("*")
{};
struct div : FOONATHAN_LEX_LITERAL("/")
{};
struct mod : FOONATHAN_LEX_LITERAL("%")
{};
struct and_ : FOONATHAN_LEX_LITERAL("&")
{};
struct xor_ : FOONATHAN_LEX_LITERAL("^")
{};
struct or_ : FOONATHAN_LEX_LITERAL("|")
{};
struct shift_right : FOONATHAN_LEX_LITERAL(">>")
{};
struct shift_left : FOONATHAN_LEX_LITERAL("<<")
{};
struct inc : FOONATHAN_LEX_LITERAL("++")
{};
struct dec : FOONATHAN_LEX_LITERAL("--")
{};
struct assign : FOONATHAN_LEX_LITERAL("=")
{};
struct add_assign : FOONATHAN_LEX_LITERAL("+=")
{};
struct sub_assign : FOONATHAN_LEX_LITERAL("-=")
{};
struct mul_assign : FOONATHAN_LEX_LITERAL("*=")
{};
struct div_assign : FOONATHAN_LEX_LITERAL("/=")
{};
struct mod_assign : FOONATHAN_LEX_LITERAL("%=")
{};
struct and_assign : FOONATHAN_LEX_LITERAL("&=")
{};
struct xor_assign : FOONATHAN_LEX_LITERAL("^=")
{};
struct or_assign : FOONATHAN_LEX_LITERAL("|=")
{};
struct shift_right_assign : FOONATHAN_LEX_LITERAL(">>=")
{};
struct shift_left_assign : FOONATHAN_LEX_LITERAL("<<=")
{};
struct equal : FOONATHAN_LEX_LITERAL("==")
{};
struct not_equal : FOONATHAN_LEX_LITERAL("!=")
{};
struct less : FOONATHAN_LEX_LITERAL("<")
{};
struct greater : FOONATHAN_LEX_LITERAL(">")
{};
struct less_equal : FOONATHAN_LEX_LITERAL("<=")
{};
struct greater_equal : FOONATHAN_LEX_LITERAL(">=")
{};
struct logical_and : FOONATHAN_LEX_LITERAL("&&")
{};
struct logical_or : FOONATHAN_LEX_LITERAL("||")
{};
struct semicolon : FOONATHAN_LEX_LITERAL(";")
{};
struct comma : FOONATHAN_LEX_LITERAL(",")
{};
struct colon : FOONATHAN_LEX_LITERAL(":")
{};
struct dot : FOONATHAN_LEX_LITERAL(".")
{};
struct ellipsis : FOONATHAN_LEX_LITERAL("...")
{};
struct arrow : FOONATHAN_LEX_LITERAL("->")
{};
struct tilde : FOONATHAN_LEX_LITERAL("~")
{};
struct exclamation_mark : FOONATHAN_LEX_LITERAL("!")
{};
struct question_mark : FOONATHAN_LEX_LITERAL("?")
{};
} // namespace C
#if !defined(FOONATHAN_LEX_TEST)
// A simple driver program that tokenizes the standard input.
# include <iostream>
# include <string>
int main()
{
namespace lex = foonathan::lex;
// We need to read the input a string.
// This is required because the tokens are just string views into that string.
std::string input(std::istreambuf_iterator<char>{std::cin}, std::istreambuf_iterator<char>{});
// Create a tokenizer for C passing it the string as input.
lex::tokenizer<C::spec> tokenizer(input.c_str(), input.size());
// As long as we still have characters left.
while (!tokenizer.is_done())
{
// Match and consume the current token.
auto token = tokenizer.get();
if (token.is_category<lex::is_literal_token>())
// If it is a literal token, just print the spelling.
std::cout << '`' << std::string(token.spelling().data(), token.spelling().size())
<< "`\n";
else
// Otherwise, print the name and the spelling.
std::cout << token.name() << ": `"
<< std::string(token.spelling().data(), token.spelling().size()) << "`\n";
}
}
#else
// Unit tests for the tokenizer.
# include <doctest.h>
# include <string>
# include <tokenize.hpp>
namespace
{
void check_token(lex::token<C::spec> token, lex::token_kind<C::spec> kind, const char* spelling)
{
REQUIRE(token.spelling() == spelling);
REQUIRE(token.kind() == kind);
}
} // namespace
TEST_CASE("whitespace and comments")
{
static constexpr const char array[] = R"(int int/* C comment */
int // C++ comment
int
)";
constexpr auto tokenizer = lex::tokenizer<C::spec>(array);
FOONATHAN_LEX_TEST_CONSTEXPR auto result = tokenize<C::spec>(tokenizer);
REQUIRE(result.size() == 4);
check_token(result[0], C::int_{}, "int");
check_token(result[1], C::int_{}, "int");
check_token(result[2], C::int_{}, "int");
check_token(result[3], C::int_{}, "int");
}
TEST_CASE("identifier and keywords")
{
static constexpr const char array[] = R"(
int
integer
Foo_bar123
__reserved
12anumber
)";
constexpr auto tokenizer = lex::tokenizer<C::spec>(array);
FOONATHAN_LEX_TEST_CONSTEXPR auto result = tokenize<C::spec>(tokenizer);
REQUIRE(result.size() == 6);
check_token(result[0], C::int_{}, "int");
check_token(result[1], C::identifier{}, "integer");
check_token(result[2], C::identifier{}, "Foo_bar123");
check_token(result[3], C::identifier{}, "__reserved");
check_token(result[4], lex::error_token{}, "12");
check_token(result[5], C::identifier{}, "anumber");
}
TEST_CASE("int and float literals")
{
static constexpr const char array[] = R"(
1234567890
0x1234567890ABCDEFabcdefl
0X42LU
01234567
0u
.123
1.23
123e45f
1.23E-4
1.
09
)";
constexpr auto tokenizer = lex::tokenizer<C::spec>(array);
FOONATHAN_LEX_TEST_CONSTEXPR auto result = tokenize<C::spec>(tokenizer);
REQUIRE(result.size() == 12);
check_token(result[0], C::int_literal{}, "1234567890");
check_token(result[1], C::int_literal{}, "0x1234567890ABCDEFabcdefl");
check_token(result[2], C::int_literal{}, "0X42LU");
check_token(result[3], C::int_literal{}, "01234567");
check_token(result[4], C::int_literal{}, "0u");
check_token(result[5], C::float_literal{}, ".123");
check_token(result[6], C::float_literal{}, "1.23");
check_token(result[7], C::float_literal{}, "123e45f");
check_token(result[8], C::float_literal{}, "1.23E-4");
check_token(result[9], C::float_literal{}, "1.");
check_token(result[10], lex::error_token{}, "0");
check_token(result[11], C::int_literal{}, "9");
}
TEST_CASE("string and char literals")
{
static constexpr const char array[] = R"(
'a'
'\n'
L'\''
'\0'
'\x42'
"hello world!"
L"hello \"world\""
)";
constexpr auto tokenizer = lex::tokenizer<C::spec>(array);
FOONATHAN_LEX_TEST_CONSTEXPR auto result = tokenize<C::spec>(tokenizer);
REQUIRE(result.size() == 7);
check_token(result[0], C::char_literal{}, R"('a')");
check_token(result[1], C::char_literal{}, R"('\n')");
check_token(result[2], C::char_literal{}, R"(L'\'')");
check_token(result[3], C::char_literal{}, R"('\0')");
check_token(result[4], C::char_literal{}, R"('\x42')");
check_token(result[5], C::string_literal{}, R"("hello world!")");
check_token(result[6], C::string_literal{}, R"(L"hello \"world\"")");
}
#endif