forked from justinmeza/lci
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenizer.h
241 lines (228 loc) · 7.57 KB
/
tokenizer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
/**
* Structures and functions for grouping lexemes into tokens. The tokenizer
* reads through an array of lexemes (generated by the lexer) and groups them
* into tokens based on their structure. In addition, some lexemes with
* semantic meaning (such as integers, floats, strings, and booleans) will have
* their values extracted and stored.
*
* \file tokenizer.h
*
* \author Justin J. Meza
*
* \date 2010-2012
*/
#ifndef __TOKENIZER_H__
#define __TOKENIZER_H__
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "lexer.h"
#include "error.h"
#undef DEBUG
/**
 * Represents a token type.  Each name corresponds either to the semantic type
 * of the token's data or to the lexemes which make up the particular token.
 *
 * \note The enumerators index the keywords array (below) one-to-one, so a new
 * token type must be inserted at the same position in both places.  The number
 * of entries is verified at compile time (see the check after the array); the
 * ordering is not, so keep the trailing comments in the array accurate.
 */
typedef enum {
	TT_INTEGER, /**< Integer literal. */
	TT_FLOAT, /**< Decimal literal. */
	TT_STRING, /**< String literal. */
	TT_IDENTIFIER, /**< Identifier literal. */
	TT_BOOLEAN, /**< Boolean literal. */
	TT_IT, /**< \ref impvar "Implicit variable". */
	TT_ITZLIEKA, /**< Inherited object declaration. */
	TT_NOOB, /**< Nil keyword. */
	TT_NUMBR, /**< Integer keyword. */
	TT_NUMBAR, /**< Decimal keyword. */
	TT_TROOF, /**< Boolean keyword. */
	TT_YARN, /**< String keyword. */
	TT_BUKKIT, /**< Array. */
	TT_EOF, /**< End of file. */
	TT_NEWLINE, /**< Newline. */
	TT_HAI, /**< Beginning of main block. */
	TT_KTHXBYE, /**< End of main block. */
	TT_HASA, /**< Variable declaration. */
	TT_ITZA, /**< Variable type initialization. */
	TT_ITZ, /**< Variable value initialization. */
	TT_RNOOB, /**< Deallocation. */
	TT_R, /**< Assignment. */
	TT_ANYR, /**< User-defined function argument separator. */
	TT_AN, /**< Built-in function argument separator. */
	TT_SUMOF, /**< Addition. */
	TT_DIFFOF, /**< Subtraction. */
	TT_PRODUKTOF, /**< Multiplication. */
	TT_QUOSHUNTOF, /**< Division. */
	TT_MODOF, /**< Modulo. */
	TT_BIGGROF, /**< Greater than. */
	TT_SMALLROF, /**< Less than. */
	TT_BOTHOF, /**< Logical AND. */
	TT_EITHEROF, /**< Logical OR. */
	TT_WONOF, /**< Logical XOR. */
	TT_NOT, /**< Logical NOT. */
	TT_MKAY, /**< Infinite arity argument delimiter. */
	TT_ALLOF, /**< Infinite arity logical AND. */
	TT_ANYOF, /**< Infinite arity logical OR. */
	TT_BOTHSAEM, /**< Equality. */
	TT_DIFFRINT, /**< Inequality. */
	TT_MAEK, /**< Cast. */
	TT_A, /**< Cast target separator. */
	TT_ISNOWA, /**< In-place cast. */
	TT_VISIBLE, /**< Print. */
	TT_SMOOSH, /**< String concatenation. */
	TT_BANG, /**< Exclamation point (!) */
	TT_GIMMEH, /**< Input. */
	TT_ORLY, /**< Conditional. */
	TT_YARLY, /**< True branch. */
	TT_MEBBE, /**< Else-if branch. */
	TT_NOWAI, /**< False branch. */
	TT_OIC, /**< Conditional and switch delimiter. */
	TT_WTF, /**< Switch. */
	TT_OMG, /**< Case. */
	TT_OMGWTF, /**< Default case. */
	TT_GTFO, /**< Break or return without value. */
	TT_IMINYR, /**< Loop beginning. */
	TT_UPPIN, /**< Auto increment loop variable. */
	TT_NERFIN, /**< Auto decrement loop variable. */
	TT_YR, /**< Function name delimiter. */
	TT_TIL, /**< Do until. */
	TT_WILE, /**< Do while. */
	TT_IMOUTTAYR, /**< Loop ending. */
	TT_HOWIZ, /**< Function definition beginning. */
	TT_IZ, /**< Function scope delimiter. */
	TT_IFUSAYSO, /**< Function definition end. */
	TT_FOUNDYR, /**< Return with value. */
	TT_SRS, /**< Indirect variable access. */
	TT_APOSTROPHEZ, /**< Array slot access ('Z). */
	TT_OHAIIM, /**< Alternate array declaration. */
	TT_IMLIEK, /**< Alternate inherited object declaration. */
	TT_KTHX, /**< End of alternate array declaration. */
	TT_ENDOFTOKENS /**< Sentinel end of this enum -- don't move it! */
} TokenType;

/**
 * The image of each token type, indexed by TokenType.  Token types whose
 * entry is the empty string (literals, identifiers, end of file, newline and
 * the TT_ENDOFTOKENS sentinel) have no keyword image in this table.
 *
 * \note Being \c static in a header, every translation unit that includes
 * this file gets its own private copy of the table.
 */
static const char *keywords[] = {
	"", /* TT_INTEGER */
	"", /* TT_FLOAT */
	"", /* TT_STRING */
	"", /* TT_IDENTIFIER */
	"", /* TT_BOOLEAN */
	"IT", /* TT_IT */
	"ITZ LIEK A", /* TT_ITZLIEKA */
	"NOOB", /* TT_NOOB */
	"NUMBR", /* TT_NUMBR */
	"NUMBAR", /* TT_NUMBAR */
	"TROOF", /* TT_TROOF */
	"YARN", /* TT_YARN */
	"BUKKIT", /* TT_BUKKIT */
	"", /* TT_EOF */
	"", /* TT_NEWLINE */
	"HAI", /* TT_HAI */
	"KTHXBYE", /* TT_KTHXBYE */
	"HAS A", /* TT_HASA */
	"ITZ A", /* TT_ITZA */
	"ITZ", /* TT_ITZ */
	"R NOOB", /* TT_RNOOB */
	"R", /* TT_R */
	"AN YR", /* TT_ANYR */
	"AN", /* TT_AN */
	"SUM OF", /* TT_SUMOF */
	"DIFF OF", /* TT_DIFFOF */
	"PRODUKT OF", /* TT_PRODUKTOF */
	"QUOSHUNT OF", /* TT_QUOSHUNTOF */
	"MOD OF", /* TT_MODOF */
	"BIGGR OF", /* TT_BIGGROF */
	"SMALLR OF", /* TT_SMALLROF */
	"BOTH OF", /* TT_BOTHOF */
	"EITHER OF", /* TT_EITHEROF */
	"WON OF", /* TT_WONOF */
	"NOT", /* TT_NOT */
	"MKAY", /* TT_MKAY */
	"ALL OF", /* TT_ALLOF */
	"ANY OF", /* TT_ANYOF */
	"BOTH SAEM", /* TT_BOTHSAEM */
	"DIFFRINT", /* TT_DIFFRINT */
	"MAEK", /* TT_MAEK */
	"A", /* TT_A */
	"IS NOW A", /* TT_ISNOWA */
	"VISIBLE", /* TT_VISIBLE */
	"SMOOSH", /* TT_SMOOSH */
	"!", /* TT_BANG */
	"GIMMEH", /* TT_GIMMEH */
	"O RLY?", /* TT_ORLY */
	"YA RLY", /* TT_YARLY */
	"MEBBE", /* TT_MEBBE */
	"NO WAI", /* TT_NOWAI */
	"OIC", /* TT_OIC */
	"WTF?", /* TT_WTF */
	"OMG", /* TT_OMG */
	"OMGWTF", /* TT_OMGWTF */
	"GTFO", /* TT_GTFO */
	"IM IN YR", /* TT_IMINYR */
	"UPPIN", /* TT_UPPIN */
	"NERFIN", /* TT_NERFIN */
	"YR", /* TT_YR */
	"TIL", /* TT_TIL */
	"WILE", /* TT_WILE */
	"IM OUTTA YR", /* TT_IMOUTTAYR */
	"HOW IZ", /* TT_HOWIZ */
	"IZ", /* TT_IZ */
	"IF U SAY SO", /* TT_IFUSAYSO */
	"FOUND YR", /* TT_FOUNDYR */
	"SRS", /* TT_SRS */
	"'Z", /* TT_APOSTROPHEZ */
	"O HAI IM", /* TT_OHAIIM */
	"IM LIEK", /* TT_IMLIEK */
	"KTHX", /* TT_KTHX */
	"" /* TT_ENDOFTOKENS */
};

/**
 * Compile-time check that keywords has exactly one entry per TokenType
 * (including the TT_ENDOFTOKENS sentinel).  If an entry is added to or removed
 * from only one of the two, the array size below becomes negative and the
 * build fails instead of the table silently going out of step with the enum.
 * Written as a typedef so it also works on compilers without _Static_assert.
 */
typedef char keywords_entry_count_matches_TokenType[
	(sizeof keywords / sizeof keywords[0]
			== (unsigned int)TT_ENDOFTOKENS + 1u) ? 1 : -1];
/**
 * Stores token data with semantic meaning.
 *
 * This is a union: \a i and \a f share the same storage, so only one of them
 * holds a meaningful value at any time.  Nothing inside the union records
 * which member is in use.
 *
 * NOTE(review): presumably the enclosing Token's \a type selects the member
 * (TT_INTEGER for \a i, TT_FLOAT for \a f) -- confirm against tokenizer.c.
 */
typedef union {
long long i; /**< Integer data. */
float f; /**< Decimal data (stored as a \c float). */
} TokenData;
/**
 * Stores a token type and any parsed values.
 *
 * NOTE(review): ownership of the two strings is not visible in this header.
 * \a image is a mutable pointer while \a fname is const-qualified, which
 * suggests the token owns \a image and only borrows \a fname -- confirm
 * against createToken() and deleteToken() in tokenizer.c.
 */
typedef struct {
TokenType type; /**< The type of token. */
TokenData data; /**< The stored data of type \a type. */
char *image; /**< The characters that comprise the token. */
const char *fname; /**< The name of the file containing the token. */
unsigned int line; /**< The line number the token was on. */
} Token;
/**
* \name Utilities
*
* Functions for performing helper tasks.
*/
/**@{*/
/*
 * Classification helpers.  Each takes a read-only string and returns an int.
 * NOTE(review): presumably the argument is a lexeme image and a nonzero
 * result means it has the named form -- confirm against tokenizer.c.
 */
int isInteger(const char *);
int isFloat(const char *);
int isString(const char *);
int isIdentifier(const char *);
/*
 * Takes a lexeme list and a pointer to an unsigned int, and returns a Token
 * pointer.
 * NOTE(review): presumably the unsigned int is the current position in the
 * list (advanced on a match) and the result is NULL when no keyword matches
 * -- confirm against tokenizer.c.
 */
Token *isKeyword(LexemeList *, unsigned int *);
/**@}*/
/**
* \name Token modifiers
*
* Functions for creating and deleting tokens.
*/
/**@{*/
/*
 * Returns a pointer to a Token built from a token type, two read-only strings
 * and an unsigned int.
 * NOTE(review): the two strings are presumably the token image and the file
 * name, followed by the line number (the order of the matching Token members)
 * -- the parameters are unnamed here, so confirm against tokenizer.c.
 */
Token *createToken(TokenType, const char *, const char *, unsigned int);
/* NOTE(review): presumably releases a token made by createToken() -- confirm. */
void deleteToken(Token *);
/*
 * Takes a pointer to an array of Token pointers, a pointer to an unsigned int
 * and a Token pointer; returns an int.
 * NOTE(review): presumably appends the token, updates the count through the
 * second argument and reports success or failure in the result -- confirm the
 * meaning of the return value against tokenizer.c.
 */
int addToken(Token ***, unsigned int *, Token*);
/*
 * NOTE(review): presumably releases a whole token array such as the one
 * returned by tokenizeLexemes(); how the end of the array is detected is not
 * visible here -- confirm.
 */
void deleteTokens(Token **);
/*
 * Takes a lexeme list, an unsigned int and a read-only string; returns an
 * unsigned int.
 * NOTE(review): presumably matches the string (e.g. an entry of keywords)
 * against the lexemes from the given position and returns how many lexemes
 * were accepted -- confirm against tokenizer.c.
 */
unsigned int acceptLexemes(LexemeList *, unsigned int, const char *);
/**@}*/
/**
* \name Lexeme tokenizer
*
* Generates tokens from lexemes.
*/
/**@{*/
/*
 * Tokenizer entry point: takes the lexeme list produced by the lexer and
 * returns an array of Token pointers.
 * NOTE(review): how the array is terminated, who frees it (presumably
 * deleteTokens()) and what is returned on error are not visible in this
 * header -- confirm against tokenizer.c.
 */
Token **tokenizeLexemes(LexemeList *);
/**@}*/
#endif /* __TOKENIZER_H__ */