-
Notifications
You must be signed in to change notification settings - Fork 0
/
earley.py
executable file
·200 lines (173 loc) · 7.22 KB
/
earley.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/usr/bin/env python
import string
from collections import defaultdict
# Homework: Make this parser run in linear time on the string
# 'a+a*a-a(a*a+a), ... ,a+a*a-a(a*a+a)'
# from the language
# S->E|S,S
# E->E+E|E-E|E*E|(E)|-E|V
# V->a|b|c
# Grammars are represented in a simple fashion, no encapsulation in a class:
# - representation is efficient but not general, as tokens are single
# characters:
# - the grammar is a hash table mapping nonterminals to a tuple containing
# productions
# - each symbol (non-terminal and terminal) is a single ASCII symbol
# - non-terminals are uppercase; the rest are terminals
# - the start nonterminal is always called S
#
# Example: grammar S->S+S|(S)|a is represented with
#
# G={'S':('S+S','(S)','a')}
#
# Suggested edge representation:
# edge is a tuple (u:int,v:int,state:tuple),
# u, v: source and destiantion of the edge, given as indices into input
# string.
# state: production with a dot; see lecture notes for description of Earley
# algorithm
# state is a tuple (LHS:string,RHS,:string,position-of-dot:int)
# LHS: nonterminal on left-hand side of a production
# RHS: right-hand side of production
# position-of-dot: where the dot is in the RHS, ranges from 0 to len[RHS],
# inclusive
# Set to True if you want to generate graphviz file.
visualize = True
# The Earley algorithm
def parse(grammar, inp, incr):
ambiguous = False
if visualize:
# write a graph that can be visualized with graphviz.
# to view the graph, use the command $ dotty graph.dot
gviz = open('graph.dot','w')
gviz.write('digraph G {\nrankdir="LR";')
for i in range(0,len(inp)):
gviz.write(' %s -> %s [label = "%s",width=2];\n' % (i, i+1, inp[i]))
def drawEdge(e):
dot = e[2][2]
rhs = e[2][1][0:dot]+'.'+e[2][1][dot:]
gviz.write(' %s -> %s [label="%s --> %s", constraint=false];\n' % \
(e[0],e[1],e[2][0],rhs))
# The graph is partioned by (destination,completenessStatus) of edges.
# The graph is thus a dictionary with the key (dst,complete).
# Each key maps to a pair consisting of a list of edges as well as a set of
# edges. Both the list and the set keep the same set of edges. We use both
# so that we can modify the set of edges while iterating over it. While a
# list data structure can be modified during iteration, it cannot be tested
# for membership as fast as the set. Hence we use it together with a set.
# See the pattern below for illustration of how we use the set and list in
# tandem.
#
# List+Set pattern:
#
# iterate over a list but use a set to determine
# constant-time membership in the (changing) list
#
# lst = [1,2]
# s = set(lst)
# for v in lst:
# print v
# if (3 not in s):
# lst.append(3)
# s.add(3)
# defaultdict is like a dictionary that provides a default value when key
# is not present
graph = defaultdict(lambda: ([],set()))
# status of edge is either complete or inProgress
complete = 0
inProgress = 1
# return (list,set) of edges
def edgesIncomingTo(dst,status):
key = (dst,status)
return graph[key]
def addEdge(e):
"""Add edge to graph and worklist if not present in graph already.
Return True iff the edge was actually inserted
"""
incr(0)
# edge to key
(src,dst,(N,RHS,pos)) = e
status = complete if len(RHS)==pos else inProgress
(edgeList,edgeSet) = edgesIncomingTo(dst,status)
if e not in edgeSet:
edgeList.append(e)
edgeSet.add(e)
# add the edge to the Dotty graph
if visualize:
drawEdge(e)
return (True, False)
return (False, True)
# Add edge (0,0,(S -> . alpha)) to worklist, for all S -> alpha
for P in grammar['S']:
addEdge((0,0,('S',P,0)))
# for all tokens on the input
for j in xrange(0,len(inp)+1):
# skip in first iteration; we need to complete and predict the
# start nonterminal S before we start advancing over the input
if j > 0:
# ADVANCE across the next token:
# for each edge (i,j-1,N -> alpha . inp[j] beta)
# add edge (i,j,N -> alpha inp[j] . beta)
for (i,_j,(N,RHS,pos)) in edgesIncomingTo(j-1,inProgress)[0]:
incr(1)
assert _j==j-1
if pos < len(RHS) and RHS[pos]==inp[j-1]:
addEdge((i,j,(N,RHS,pos+1)))
# Repeat COMPLETE and PREDICT until no more edges can be added
edgeWasInserted = True
while edgeWasInserted:
edgeWasInserted = False
# COMPLETE productions
# for each edge (i,j,N -> alpha .)
# for each edge (k,i,M -> beta . N gamma)
# add edge (k,j,M -> beta N . gamma)
for (i,_j,(N,RHS,pos)) in edgesIncomingTo(j,complete)[0]:
assert _j==j and pos == len(RHS)
for (k,_i,(M,RHS2,pos2)) in edgesIncomingTo(i,inProgress)[0]:
incr(2)
if RHS2[pos2]==N:
(inserted, amb) = addEdge((k,j,(M,RHS2,pos2+1)))
edgeWasInserted = inserted or \
edgeWasInserted
# PREDICT what the parser is to see on input (move dots in edges
# that are in progress)
#
# for each edge (i,j,N -> alpha . M beta)
# for each production M -> gamma
# add edge (j,j,M -> . gamma)
for (i,_j,(N,RHS,pos)) in edgesIncomingTo(j,inProgress)[0]:
incr(3)
# non-terminals are upper case
if RHS[pos] in string.ascii_uppercase:
M = RHS[pos]
# prediction: for all rules D->alpha add edge (j,j,.alpha)
for RHS in grammar[M]:
incr(4)
(inserted, amb) = addEdge((j,j,(M,RHS,0)))
edgeWasInserted = inserted or \
edgeWasInserted
if visualize:
gviz.write('}')
gviz.close()
set_edges = {}
is_amb = False
for (dest,comp) in graph.keys():
if comp == 0:
for (src,dest,(N,RHS,pos)) in edgesIncomingTo(dest,comp)[0]:
if (src,dest) in set_edges:
if N in set_edges[(src,dest)]:
is_amb = True
else:
set_edges[(src,dest)][N] = RHS
else:
set_edges[(src,dest)] = {}
set_edges[(src,dest)][N] = RHS
#print set_edges
if is_amb:
print "Ambiguous"
# input has been parsed OK if and only if an edge (0,n,S -> alpha .) exists
for RHS in grammar['S']:
if (0,len(inp),('S',RHS,len(RHS))) in \
edgesIncomingTo(len(inp),complete)[1]:
return True
return False