forked from DataStories-UniPi/miniDB
-
Notifications
You must be signed in to change notification settings - Fork 1
/
lex.py
181 lines (161 loc) · 5.99 KB
/
lex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import sys
import enum
class Lexer:
    """Hand-written scanner that splits a SQL-like source string into tokens.

    The lexer keeps a cursor (``curPos``) and the character under it
    (``curChar``). Once the cursor moves past the end of the source,
    ``curChar`` is the NUL character, which ``getToken`` reports as an EOF
    token. Call ``getToken()`` repeatedly until that EOF token is returned.
    """

    # Single-character tokens, mapped to the *name* of their TokenType member.
    # Names (not members) are stored so that TokenType only has to exist when
    # getToken() runs, not when this class body is executed.
    _SINGLE_CHAR_TOKENS = {
        '*': 'ASTERISK',
        ';': 'SEMICOLON',
        '.': 'DOT',
        '(': 'LPARENS',
        ')': 'RPARENS',
        ',': 'COMMA',
        '>': 'GT',
        '<': 'LT',
        '=': 'EQ',
    }

    def __init__(self, input):
        """Create a lexer over ``input`` and load its first character.

        The parameter name shadows the ``input`` builtin; it is kept so that
        existing keyword-argument callers continue to work.
        """
        self.source = input
        self.curChar = ''   # Current character in the string.
        self.curPos = -1    # Current position in the string.
        self.nextChar()

    def nextChar(self):
        """Advance the cursor by one and refresh ``curChar`` (NUL at EOF)."""
        self.curPos += 1
        if self.curPos >= len(self.source):
            self.curChar = '\0'  # EOF
        else:
            self.curChar = self.source[self.curPos]

    def peek(self):
        """Return the character after the cursor without consuming it.

        Returns the NUL character when there is no next character.
        """
        if self.curPos + 1 >= len(self.source):
            return '\0'
        return self.source[self.curPos + 1]

    def abort(self, message):
        """Terminate lexing by raising ``SystemExit`` with ``message``."""
        sys.exit("Lexing error. " + message)

    def skipWhitespace(self):
        """Advance past any run of spaces, tabs, carriage returns, newlines."""
        while self.curChar in (' ', '\t', '\r', '\n'):
            self.nextChar()

    def skipComment(self):
        """Skip a ``--`` line comment that starts at the cursor, if any.

        The cursor is left on the terminating newline, or at end of input
        when the comment is the last thing in the source. Stopping at end of
        input matters: without that check a trailing comment with no final
        newline would never terminate.
        """
        if self.curChar == '-' and self.peek() == '-':
            while self.curChar != '\n' and self.curPos < len(self.source):
                self.nextChar()

    def _skipIgnorable(self):
        """Skip every interleaved run of whitespace and ``--`` comments."""
        while True:
            self.skipWhitespace()
            if self.curChar == '-' and self.peek() == '-':
                self.skipComment()
            else:
                return

    def _lexString(self):
        """Lex a single-quoted string; leaves the cursor on the closing quote.

        Aborts on an unterminated literal or on a disallowed character
        (carriage return, newline, tab, backslash, or percent sign).
        """
        self.nextChar()  # Step past the opening quote.
        startPos = self.curPos
        while self.curChar != '\'':
            if self.curPos >= len(self.source):
                # Ran off the end without finding the closing quote.
                self.abort("Unterminated string.")
            if self.curChar in ('\r', '\n', '\t', '\\', '%'):
                self.abort("Illegal character in string.")
            self.nextChar()
        return Token(self.source[startPos:self.curPos], TokenType.STRING)

    def _lexNumber(self):
        """Lex digits with an optional fraction; cursor ends on last digit."""
        startPos = self.curPos
        while self.peek().isdigit():
            self.nextChar()
        if self.peek() == '.':  # Decimal point.
            self.nextChar()
            # Must have at least one digit after the decimal point.
            if not self.peek().isdigit():
                self.abort("Illegal character in number.")
            while self.peek().isdigit():
                self.nextChar()
        return Token(self.source[startPos:self.curPos + 1], TokenType.NUMBER)

    def _lexWord(self):
        """Lex an identifier or keyword; cursor ends on its last character.

        After the leading letter, letters, digits, ``_`` and ``-`` are
        accepted as part of the word.
        """
        startPos = self.curPos
        while self.peek().isalnum() or self.peek() in ('_', '-'):
            self.nextChar()
        tokText = self.source[startPos:self.curPos + 1]
        keyword = Token.checkIfKeyword(tokText)
        if keyword is None:
            return Token(tokText, TokenType.IDENT)
        return Token(tokText, keyword)

    def getToken(self):
        """Return the next ``Token`` and advance the cursor past it.

        Leading whitespace and ``--`` comments are skipped first, in any
        order and any number of times. Newlines count as whitespace, so no
        NEWLINE token is produced here. At end of input a token of kind
        ``TokenType.EOF`` with empty text is returned.

        Raises:
            SystemExit: via ``abort`` on an unknown character, a malformed
                number, or an illegal or unterminated string literal.
        """
        self._skipIgnorable()
        token = None
        ch = self.curChar
        if ch == '\0':
            token = Token('', TokenType.EOF)
        elif ch in self._SINGLE_CHAR_TOKENS:
            token = Token(ch, TokenType[self._SINGLE_CHAR_TOKENS[ch]])
        elif ch == '\'':
            token = self._lexString()
        elif ch.isdigit():
            token = self._lexNumber()
        elif ch.isalpha():
            token = self._lexWord()
        else:
            self.abort("Unknown token: " + ch)
        # Every branch leaves the cursor on the token's final character.
        self.nextChar()
        return token
# Token contains the original text and the type of token.
class Token:
    """One lexical unit: the matched source text plus its classification."""

    def __init__(self, tokenText, tokenKind):
        """Store the token's text and kind unchanged."""
        # Raw text of the token; used for identifiers, strings and numbers.
        self.text = tokenText
        # The TokenType member this token is classified as.
        self.kind = tokenKind

    def __str__(self):
        """Return a two-line, human-readable dump of the token."""
        pieces = ['Text: ', self.text, '\n Kind: ', str(self.kind), '\n\n']
        return ''.join(pieces)

    @staticmethod
    def checkIfKeyword(tokenText):
        """Return the keyword TokenType named exactly ``tokenText``, or None.

        Only members whose value lies in the range 100 to 199 inclusive are
        considered keywords. The name comparison is case-sensitive.
        """
        matches = (
            member
            for member in TokenType
            if member.name == tokenText and 100 <= member.value < 200
        )
        return next(matches, None)
# TokenType is our enum for all the types of tokens.
class TokenType(enum.Enum):
    """Every kind of token the lexer can classify.

    Values are grouped by range: -1 marks end of input, 1-9 are literals and
    punctuation, 101-121 are keywords, and 201-204 are operators.
    Token.checkIfKeyword treats values from 100 to 199 as keywords, so new
    keywords must be numbered inside that range.
    """

    # End of input.
    EOF = -1

    # Literals, identifiers and punctuation.
    NUMBER = 1
    IDENT = 2
    STRING = 3
    SEMICOLON = 4
    COMMA = 5
    LPARENS = 6
    RPARENS = 7
    DOT = 8
    NEWLINE = 9

    # Keywords (member name is the keyword's exact spelling).
    SELECT = 101
    FROM = 102
    WHERE = 103
    UPDATE = 104
    SET = 105
    INSERT = 106
    VALUES = 107
    DROP = 108
    DELETE = 109
    DATABASE = 110
    TABLE = 111
    CREATE = 112
    INT = 113
    TEXT = 114
    INTO = 115
    INDEX = 116
    ON = 117
    PRIMARY = 118
    KEY = 119
    INNER = 120
    JOIN = 121

    # Operators.
    EQ = 201
    ASTERISK = 202
    LT = 203
    GT = 204