-
Notifications
You must be signed in to change notification settings - Fork 1
/
lexer.cs
592 lines (508 loc) · 18.2 KB
/
lexer.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
namespace CUP
{
using System;
using Symbol = CUP.runtime.Symbol;
/// <summary>This class implements a small scanner (aka lexical analyzer or lexer) for
/// the JavaCup specification. This scanner reads bytes from the stream
/// supplied via SetStream() and returns Symbol objects carrying the terminal
/// number of the next token. Once end of input is reached the EOF Symbol is
/// returned on every subsequent call.<p>
/// Symbols currently returned include: <pre>
/// Symbol Constant Returned Symbol Constant Returned
/// ------ ----------------- ------ -----------------
/// "package" PACKAGE "import" IMPORT
/// "code" CODE "action" ACTION
/// "parser" PARSER "terminal" TERMINAL
/// "non" NON "init" INIT
/// "scan" SCAN "with" WITH
/// "start" START "precedence" PRECEDENCE
/// "left" LEFT "right" RIGHT
/// "nonassoc" NONASSOC "%prec" PERCENT_PREC
/// [ LBRACK ] RBRACK
/// ; SEMI
/// , COMMA * STAR
/// . DOT : COLON
/// ::= COLON_COLON_EQUALS | BAR
/// identifier ID {:...:} CODE_STRING
/// "nonterminal" NONTERMINAL
/// </pre>
/// All symbol constants are defined in sym.java which is generated by
/// JavaCup from parser.cup.<p>
///
/// In addition to the scanner proper (called first via init() then with
/// next_token() to get each Symbol) this class provides simple error and
/// warning routines and keeps a count of errors and warnings that is
/// publicly accessible.<p>
///
/// This class is "static" (i.e., it has only static members and methods).
/// *
/// </summary>
/// <version> last updated: 7/3/96
/// </version>
/// <author> Frank Flannery
///
/// </author>
public class lexer
{
/// <summary>Input stream the scanner reads from. It is assigned by SetStream()
/// and read one byte at a time by init() and advance(); it stays null until
/// SetStream() has been called.
/// </summary>
private static System.IO.Stream _inStream = null;
/*-----------------------------------------------------------*/
/*--- Constructor(s) ----------------------------------------*/
/*-----------------------------------------------------------*/
/// <summary>Public no-argument constructor; it does nothing. Every field and
/// method of this class is static, so no instance is needed to use the scanner.
/// </summary>
public lexer()
{
}
/*-----------------------------------------------------------*/
/*--- Static (Class) Variables ------------------------------*/
/*-----------------------------------------------------------*/
/// <summary>First character of lookahead (the character currently being examined).
/// </summary>
protected internal static int next_char;
/// <summary>Second character of lookahead.
/// </summary>
protected internal static int next_char2;
/// <summary>Third character of lookahead.
/// </summary>
protected internal static int next_char3;
/// <summary>Fourth character of lookahead.
/// </summary>
protected internal static int next_char4;
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>EOF constant. This value is stored in the lookahead slots once the
/// input is exhausted.
/// </summary>
protected internal static int EOF_CHAR = - 1;
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Table of keywords. Keywords are initially treated as identifiers.
/// Just before they are returned we look them up in this table to see if
/// they match one of the keywords. The string of the name is the key here,
/// and the stored value is the symbol number. The table is filled by init().
/// </summary>
protected internal static System.Collections.Hashtable keywords = new System.Collections.Hashtable(23);
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Table of single character symbols. For ease of implementation, we
/// store all unambiguous single character Symbols in this table, keyed by the
/// character itself and holding the corresponding symbol number. The table is
/// filled by init() and queried by find_single_char().
/// </summary>
protected internal static System.Collections.Hashtable char_symbols = new System.Collections.Hashtable(11);
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Current line number for use in error messages.
/// </summary>
protected internal static int current_line = 1;
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Character position in current line. advance() resets it to 1
/// whenever a line break is consumed.
/// </summary>
protected internal static int current_position = 1;
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Absolute character position in the input. advance() increments it
/// on every call and never resets it at line breaks.
/// </summary>
protected internal static int absolute_position = 1;
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Count of total errors detected so far.
/// </summary>
public static int error_count = 0;
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Count of warnings issued so far
/// </summary>
public static int warning_count = 0;
/// <summary>Select the stream the scanner reads its input from. The stream is
/// only stored here; the first bytes are read when init() is called.
/// </summary>
/// <param name="inStream">the stream to read the specification from.
/// </param>
public static void SetStream(System.IO.Stream inStream)
{
    lexer._inStream = inStream;
}
/*-----------------------------------------------------------*/
/*--- Static Methods ----------------------------------------*/
/*-----------------------------------------------------------*/
/// <summary>Initialize the scanner. This sets up the keywords and char_symbols
/// tables and reads the first four characters of lookahead from the stream
/// supplied through SetStream().
/// </summary>
/// <exception cref="System.InvalidOperationException">SetStream() has not been
/// called, so there is no input stream to read from.
/// </exception>
public static void init()
{
    /* fail with a clear message instead of a NullReferenceException deep in the reads below */
    if (_inStream == null)
    {
        throw new System.InvalidOperationException("lexer.SetStream() must be called before lexer.init()");
    }

    /* set up the keyword table */
    SupportClass.PutElement(keywords, "package", sym.PACKAGE);
    SupportClass.PutElement(keywords, "import", sym.IMPORT);
    SupportClass.PutElement(keywords, "code", sym.CODE);
    SupportClass.PutElement(keywords, "action", sym.ACTION);
    SupportClass.PutElement(keywords, "parser", sym.PARSER);
    SupportClass.PutElement(keywords, "terminal", sym.TERMINAL);
    SupportClass.PutElement(keywords, "non", sym.NON);
    SupportClass.PutElement(keywords, "nonterminal", sym.NONTERMINAL); // [CSA]
    SupportClass.PutElement(keywords, "init", sym.INIT);
    SupportClass.PutElement(keywords, "scan", sym.SCAN);
    SupportClass.PutElement(keywords, "with", sym.WITH);
    SupportClass.PutElement(keywords, "start", sym.START);
    SupportClass.PutElement(keywords, "precedence", sym.PRECEDENCE);
    SupportClass.PutElement(keywords, "left", sym.LEFT);
    SupportClass.PutElement(keywords, "right", sym.RIGHT);
    SupportClass.PutElement(keywords, "nonassoc", sym.NONASSOC);

    /* set up the table of single character symbols */
    SupportClass.PutElement(char_symbols, ';', sym.SEMI);
    SupportClass.PutElement(char_symbols, ',', sym.COMMA);
    SupportClass.PutElement(char_symbols, '*', sym.STAR);
    SupportClass.PutElement(char_symbols, '.', sym.DOT);
    SupportClass.PutElement(char_symbols, '|', sym.BAR);
    SupportClass.PutElement(char_symbols, '[', sym.LBRACK);
    SupportClass.PutElement(char_symbols, ']', sym.RBRACK);

    /* read four characters of lookahead; once a slot holds EOF every later
       slot is set to EOF as well and the stream is not read again */
    next_char = _inStream.ReadByte();
    next_char2 = (next_char == EOF_CHAR) ? EOF_CHAR : _inStream.ReadByte();
    next_char3 = (next_char2 == EOF_CHAR) ? EOF_CHAR : _inStream.ReadByte();
    next_char4 = (next_char3 == EOF_CHAR) ? EOF_CHAR : _inStream.ReadByte();
}
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Advance the scanner one character in the input stream. The four
/// lookahead slots are shifted down by one and the last slot is refilled from
/// the input stream, unless a slot already holds EOF, in which case every
/// slot after it is set to EOF and the stream is not read. The position
/// counters are updated as well: a line break is counted when the consumed
/// character is '\n', or is '\r' that is not followed by '\n'.
/// </summary>
protected internal static void advance()
{
    /* remember the character being consumed for the line-break test below */
    int consumed = next_char;

    /* shift the lookahead window, propagating EOF into every later slot */
    next_char = next_char2;
    next_char2 = (next_char == EOF_CHAR) ? EOF_CHAR : next_char3;
    next_char3 = (next_char2 == EOF_CHAR) ? EOF_CHAR : next_char4;
    next_char4 = (next_char3 == EOF_CHAR) ? EOF_CHAR : _inStream.ReadByte();

    /* count this */
    absolute_position += 1;
    current_position += 1;

    bool bare_carriage_return = (consumed == '\r') && (next_char != '\n');
    if (consumed == '\n' || bare_carriage_return)
    {
        current_line += 1;
        current_position = 1;
    }
}
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Report an error. The message is prefixed with the current line
/// number and the position within that line, written to standard error,
/// and counted in error_count.
/// </summary>
/// <param name="message">the message to print.
/// </param>
public static void emit_error(string message)
{
    System.Text.StringBuilder line = new System.Text.StringBuilder("Error at ");
    line.Append(current_line);
    line.Append("(");
    line.Append(current_position);
    line.Append("): ");
    line.Append(message);
    System.Console.Error.WriteLine(line.ToString());
    error_count += 1;
}
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Report a warning. The message is prefixed with the current line
/// number and the position within that line, written to standard error,
/// and counted in warning_count.
/// </summary>
/// <param name="message">the message to print.
/// </param>
public static void emit_warn(string message)
{
    System.Text.StringBuilder line = new System.Text.StringBuilder("Warning at ");
    line.Append(current_line);
    line.Append("(");
    line.Append(current_position);
    line.Append("): ");
    line.Append(message);
    System.Console.Error.WriteLine(line.ToString());
    warning_count += 1;
}
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Determine if a character is ok to start an id: an ASCII letter
/// (upper or lower case) or an underscore.
/// </summary>
/// <param name="ch">the character in question.
/// </param>
/// <returns>true when ch may begin an identifier.
/// </returns>
protected internal static bool id_start_char(int ch)
{
    if (ch == '_')
    {
        return true;
    }
    if (ch >= 'A' && ch <= 'Z')
    {
        return true;
    }
    // later need to deal with non-8-bit chars here
    return ch >= 'a' && ch <= 'z';
}
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Determine if a character is ok for the middle of an id: anything
/// accepted by id_start_char() or an ASCII digit.
/// </summary>
/// <param name="ch">the character in question.
/// </param>
/// <returns>true when ch may appear after the first character of an identifier.
/// </returns>
protected internal static bool id_char(int ch)
{
    bool is_digit = (ch >= '0') && (ch <= '9');
    if (is_digit)
    {
        return true;
    }
    return id_start_char(ch);
}
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Try to look up a single character symbol, returns -1 for not found.
/// Values that are not valid characters (notably the EOF marker -1) are
/// reported as not found without being converted to a char.
/// </summary>
/// <param name="ch">the character in question.
/// </param>
/// <returns>the symbol number registered for ch in char_symbols, or -1.
/// </returns>
protected internal static int find_single_char(int ch)
{
    /* the scanner calls this with EOF (-1) too; narrowing such a value to
       char would throw OverflowException when compiled with overflow checking */
    if (ch < char.MinValue || ch > char.MaxValue)
    {
        return -1;
    }

    /* a single lookup: the Hashtable indexer yields null for a missing key */
    object entry = char_symbols[(char) ch];
    if (entry == null)
    {
        return -1;
    }
    return (int) entry;
}
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Skip over a comment. On entry next_char is '/'. A "/*" opener is
/// skipped through the matching "*/" (running into EOF first is reported as
/// an error); a "//" opener is skipped up to, but not including, the next
/// '\n', '\r', '\f' or EOF. Any other second character is reported as a
/// malformed comment and a single character is consumed.
/// </summary>
protected internal static void swallow_comment()
{
    bool block_style = (next_char2 == '*');
    bool line_style = (next_char2 == '/');

    /* shouldn't get here, but... if we get here we have an error */
    if (!block_style && !line_style)
    {
        emit_error("Malformed comment in specification -- ignored");
        advance();
        return;
    }

    /* swallow the two-character opener */
    advance();
    advance();

    if (!block_style)
    {
        /* new style comment: stop in front of the line terminator or at EOF */
        while (!(next_char == '\n' || next_char == '\r' || next_char == '\f' || next_char == EOF_CHAR))
        {
            advance();
        }
        return;
    }

    /* traditional comment: consume until the closer has been swallowed */
    while (next_char != EOF_CHAR)
    {
        if (next_char == '*' && next_char2 == '/')
        {
            advance();
            advance();
            return;
        }
        advance();
    }

    /* ran out of input before seeing the closer */
    emit_error("Specification file ends inside a comment");
}
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Swallow up a code string. Code strings begin with "{:" and include
/// all characters up to the first occurrence of ":}" (there is no way to
/// include ":}" inside a code string). If the input ends before the closer
/// is seen, an error is emitted and the text collected so far is returned.
/// </summary>
/// <returns>a CODE_STRING Symbol whose value is the text between the
/// delimiters.
/// </returns>
protected internal static Symbol do_code_string()
{
    System.Text.StringBuilder result = new System.Text.StringBuilder();

    /* at this point we have lookahead of "{:" -- swallow that */
    advance();
    advance();

    /* save chars until we see ":}" */
    for (; ; )
    {
        if (next_char == ':' && next_char2 == '}')
        {
            /* advance past the closer -- only when it is really there */
            advance();
            advance();
            break;
        }

        /* if we have run off the end issue a message and stop; there is no
           closer to skip, so the position counters are left where input ended */
        if (next_char == EOF_CHAR)
        {
            emit_error("Specification file ends inside a code string");
            break;
        }

        /* otherwise record the char and move on */
        result.Append((char) next_char);
        advance();
    }

    return new Symbol(sym.CODE_STRING, result.ToString());
}
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Process an identifier. On entry next_char holds the first character
/// of the identifier; following characters are collected for as long as
/// id_char() accepts them (letters, digits and underscores). The collected
/// text is then looked up in the keywords table.
/// </summary>
/// <returns>a Symbol carrying the keyword's symbol number when the text is a
/// keyword, otherwise an ID Symbol whose value is the identifier text.
/// </returns>
protected internal static Symbol do_id()
{
    System.Text.StringBuilder text = new System.Text.StringBuilder();

    /* the first character is taken unconditionally, the rest while they fit */
    do
    {
        text.Append((char) next_char);
        advance();
    }
    while (id_char(next_char));

    /* extract a string and try to look it up as a keyword */
    string name = text.ToString();
    if (keywords.ContainsKey(name))
    {
        int keyword_sym = (int) keywords[name];
        return new Symbol(keyword_sym);
    }
    return new Symbol(sym.ID, name);
}
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Return one Symbol. This is the main external interface to the
/// scanner. It simply delegates to real_next_token(), which does the work;
/// to trace the scanner while debugging, this can be changed to delegate to
/// debug_next_token() instead.
/// </summary>
/// <returns>the next Symbol found in the input.
/// </returns>
public static Symbol next_token()
{
    Symbol token = real_next_token();
    return token;
}
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>Debugging version of next_token(). It obtains the next Symbol from
/// real_next_token(), prints a line naming its symbol number on standard
/// output, and then returns it.
/// </summary>
/// <returns>the Symbol produced by real_next_token().
/// </returns>
public static Symbol debug_next_token()
{
    Symbol token = real_next_token();
    string trace = "# next_Symbol() => " + token.sym;
    System.Console.Out.WriteLine(trace);
    return token;
}
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/// <summary>The actual routine to return one Symbol. This is normally called from
/// next_token(), but for debugging purposes can be called indirectly from
/// debug_next_token(). White space and comments are skipped; a stray '%'
/// is reported as an error and any other unrecognized character as a warning,
/// and in both cases scanning resumes with the following character.
/// </summary>
/// <returns>the next Symbol in the input; the EOF Symbol once the input is
/// exhausted.
/// </returns>
protected internal static Symbol real_next_token()
{
    int sym_num;

    for (; ; )
    {
        /* look for white space */
        if (next_char == ' ' || next_char == '\t' || next_char == '\n' || next_char == '\f' || next_char == '\r')
        {
            /* advance past it and try the next character */
            advance();
            continue;
        }

        /* look for a single character symbol */
        sym_num = find_single_char(next_char);
        if (sym_num != -1)
        {
            /* found one -- advance past it and return a Symbol for it */
            advance();
            return new Symbol(sym_num);
        }

        /* look for : or ::= */
        if (next_char == ':')
        {
            /* if we don't have a second ':' return COLON */
            if (next_char2 != ':')
            {
                advance();
                return new Symbol(sym.COLON);
            }

            /* move forward and look for the '=' */
            advance();
            if (next_char2 == '=')
            {
                advance();
                advance();
                return new Symbol(sym.COLON_COLON_EQUALS);
            }

            /* return just the colon (already consumed); the second ':' is
               left in place for the next call */
            return new Symbol(sym.COLON);
        }

        /* find a "%prec" string and return it. otherwise, a '%' was found,
           which has no right being in the specification otherwise */
        if (next_char == '%')
        {
            advance();
            if ((next_char == 'p') && (next_char2 == 'r') && (next_char3 == 'e') && (next_char4 == 'c'))
            {
                advance();
                advance();
                advance();
                advance();
                return new Symbol(sym.PERCENT_PREC);
            }

            emit_error("Found extraneous percent sign");
            /* restart the scan so the character after the '%' gets the full
               treatment (white space, single character symbols, ':'); falling
               through would skip those checks and discard a valid token such
               as ';' as an unrecognized character */
            continue;
        }

        /* look for a comment */
        if (next_char == '/' && (next_char2 == '*' || next_char2 == '/'))
        {
            /* swallow then continue the scan */
            swallow_comment();
            continue;
        }

        /* look for start of code string */
        if (next_char == '{' && next_char2 == ':')
        {
            return do_code_string();
        }

        /* look for an id or keyword */
        if (id_start_char(next_char))
        {
            return do_id();
        }

        /* look for EOF */
        if (next_char == EOF_CHAR)
        {
            return new Symbol(sym.EOF);
        }

        /* if we get here, we have an unrecognized character */
        emit_warn("Unrecognized character '" + (char) next_char + "'(" + next_char + ") -- ignored");

        /* advance past it */
        advance();
    }
}
/*-----------------------------------------------------------*/
}
}