-
Notifications
You must be signed in to change notification settings - Fork 4
/
tokenizer.py
143 lines (102 loc) · 3.62 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import enum
import re
from codepage import FUNCTIONS, INDICATORS
class TokenType(enum.Enum):
    """Kinds of token that the tokenizer can emit."""

    # Numeric literals (stored as int or float values on the token).
    NUMBER = "number"
    # Text enclosed in double quotes.
    STRING = "string"
    # A character found in the FUNCTIONS table.
    FUNCTION = "function"
    # A character found in the INDICATORS table.
    INDICATOR = "indicator"
class RegEx(enum.Enum):
    """Regular-expression patterns consumed by the IsType checks."""

    # Matches a single decimal digit.
    NUMBER = r"\d"
    # Matches a single space character.
    IGNORE_TOKEN = r" "
class IsType:
    """Namespace of predicates that classify a piece of source text."""

    @staticmethod
    def number(char: str):
        """Return True when *char* starts with a digit (RegEx.NUMBER)."""
        return re.match(RegEx.NUMBER.value, char) is not None

    @staticmethod
    def function(char: str):
        """Return True when *char* is a member of FUNCTIONS."""
        return char in FUNCTIONS

    @staticmethod
    def ignore_token(char: str):
        """Return True when *char* starts with a space (RegEx.IGNORE_TOKEN)."""
        return re.match(RegEx.IGNORE_TOKEN.value, char) is not None

    @staticmethod
    def string_delimiter(char: str):
        """Return True when *char* is the double-quote string delimiter."""
        return char == '"'

    @staticmethod
    def indicator(char: str):
        """Return True when *char* is a member of INDICATORS."""
        return char in INDICATORS
class Token:
    """A single token: its TokenType (``name``) and the associated ``value``."""

    def __init__(self, name: TokenType, value):
        self.name, self.value = name, value

    def __repr__(self):
        # Only the value is shown; the token type is left out of the repr.
        return f"Token({self.value!r})"
def _float_literal_value(literal: str) -> float:
    """Convert collected float text to a float, filling in omitted parts.

    A missing integer part is read as 0 (".5" -> 0.5) and a missing
    fractional part is read as 5 ("1." -> 1.5, "." -> 0.5).
    """
    if literal.startswith("."):
        literal = "0" + literal
    if literal.endswith("."):
        literal += "5"
    return float(literal)


def tokenize(text: str) -> list[Token]:
    """
    Tokenize text.

    Consecutive digits form an integer NUMBER token. A "." turns the
    literal into a float NUMBER token; an omitted integer part counts as 0
    and an omitted fraction counts as 5 (so "." alone is 0.5). A second "."
    closes the current float and starts a new one, so "1.2.3" yields 1.2
    followed by 0.3.

    Text between double quotes becomes a STRING token; digits and "."
    inside a string are kept as plain text. A string still open at the end
    of the input is closed implicitly. Characters accepted by
    IsType.function / IsType.indicator become FUNCTION / INDICATOR tokens.
    Any other character produces no token and only ends a pending number.

    :param text: source text to tokenize
    :return: the tokens in source order
    """
    tokens: list[Token] = []
    digits = ""  # digits collected since the last number boundary
    float_text = ""  # integer part plus "." of the float being built
    in_float = False
    in_string = False
    string_chars = []

    for char in text:
        is_digit = IsType.number(char)
        is_delimiter = IsType.string_delimiter(char)

        # Number bookkeeping is skipped inside a string literal, so a "."
        # in quoted text can no longer start a spurious float.
        if not is_digit and not in_string:
            if char == ".":  # decimal point
                if in_float:
                    # A literal holds one decimal point at most: finish the
                    # current float and let this "." start the next one.
                    tokens.append(
                        Token(
                            TokenType.NUMBER,
                            _float_literal_value(float_text + digits),
                        )
                    )
                    float_text = "."
                else:
                    float_text = digits + "."
                in_float = True
                digits = ""  # following digits form the fractional part
            elif in_float:
                tokens.append(
                    Token(
                        TokenType.NUMBER,
                        _float_literal_value(float_text + digits),
                    )
                )
                in_float = False
                float_text = digits = ""
            elif digits:
                tokens.append(Token(TokenType.NUMBER, int(digits)))
                digits = ""

        if in_string and not is_delimiter:
            string_chars.append(char)
        elif IsType.function(char):  # make sure it's a function and not an indicator
            tokens.append(Token(TokenType.FUNCTION, char))
        elif IsType.indicator(char):  # special processing needed
            tokens.append(Token(TokenType.INDICATOR, char))
        elif is_digit:
            digits += char
        elif is_delimiter:
            if in_string:
                tokens.append(Token(TokenType.STRING, "".join(string_chars)))
                string_chars = []
            in_string = not in_string

    # Flush whatever literal is still open at the end of the input.
    if in_float:
        tokens.append(
            Token(TokenType.NUMBER, _float_literal_value(float_text + digits))
        )
    elif digits:
        tokens.append(Token(TokenType.NUMBER, int(digits)))
    if in_string:
        tokens.append(Token(TokenType.STRING, "".join(string_chars)))
    return tokens
if __name__ == "__main__":
    # Manual smoke check: tokenize a short sample and print the tokens.
    sample_source = "+1 2"
    print(tokenize(sample_source))