-
Notifications
You must be signed in to change notification settings - Fork 17
/
hobbs.py
512 lines (445 loc) · 18.5 KB
/
hobbs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
"""
Implementation of Hobbs' algorithm for pronoun resolution.
Chris Ward, 2014
"""
import sys
import nltk
from nltk.corpus import names
from nltk import Tree
import Queue
# Labels for nominal heads
nominal_labels = ["NN", "NNS", "NNP", "NNPS", "PRP"]
def get_pos(tree, node):
""" Given a tree and a node, return the tree position
of the node.
"""
for pos in tree.treepositions():
if tree[pos] == node:
return pos
def get_dom_np(sents, pos):
""" Finds the position of the NP that immediately dominates
the pronoun.
Args:
sents: list of trees (or tree) to search
pos: the tree position of the pronoun to be resolved
Returns:
tree: the tree containing the pronoun
dom_pos: the position of the NP immediately dominating
the pronoun
"""
# start with the last tree in sents
tree = sents[-1]
# get the NP's position by removing the last element from
# the pronoun's
dom_pos = pos[:-1]
return tree, dom_pos
def walk_to_np_or_s(tree, pos):
""" Takes the tree being searched and the position from which
the walk up is started. Returns the position of the first NP
or S encountered and the path taken to get there from the
dominating NP. The path consists of a list of tree positions.
Args:
tree: the tree being searched
pos: the position from which the walk is started
Returns:
path: the path taken to get the an NP or S node
pos: the position of the first NP or S node encountered
"""
path = [pos]
still_looking = True
while still_looking:
# climb one level up the tree by removing the last element
# from the current tree position
pos = pos[:-1]
path.append(pos)
# if an NP or S node is encountered, return the path and pos
if "NP" in tree[pos].label() or tree[pos].label() == "S":
still_looking = False
return path, pos
def bft(tree):
""" Perform a breadth-first traversal of a tree.
Return the nodes in a list in level-order.
Args:
tree: a tree node
Returns:
lst: a list of tree nodes in left-to-right level-order
"""
lst = []
queue = Queue.Queue()
queue.put(tree)
while not queue.empty():
node = queue.get()
lst.append(node)
for child in node:
if isinstance(child, nltk.Tree):
queue.put(child)
return lst
def count_np_nodes(tree):
""" Function from class to count NP nodes.
"""
np_count = 0
if not isinstance(tree, nltk.Tree):
return 0
elif "NP" in tree.label() and tree.label() not in nominal_labels:
return 1 + sum(count_np_nodes(c) for c in tree)
else:
return sum(count_np_nodes(c) for c in tree)
def check_for_intervening_np(tree, pos, proposal, pro):
""" Check if subtree rooted at pos contains at least
three NPs, one of which is:
(i) not the proposal,
(ii) not the pronoun, and
(iii) greater than the proposal
Args:
tree: the tree being searched
pos: the position of the root subtree being searched
proposal: the position of the proposed NP antecedent
pro: the pronoun being resolved (string)
Returns:
True if there is an NP between the proposal and the pronoun
False otherwise
"""
bf = bft(tree[pos])
bf_pos = [get_pos(tree, node) for node in bf]
if count_np_nodes(tree[pos]) >= 3:
for node_pos in bf_pos:
if "NP" in tree[node_pos].label() \
and tree[node_pos].label() not in nominal_labels:
if node_pos != proposal and node_pos != get_pos(tree, pro):
if node_pos < proposal:
return True
return False
def traverse_left(tree, pos, path, pro, check=1):
""" Traverse all branches below pos to the left of path in a
left-to-right, breadth-first fashion. Returns the first potential
antecedent found.
If check is set to 1, propose as an antecedent any NP node
that is encountered which has an NP or S node between it and pos.
If check is set to 0, propose any NP node encountered as the antecedent.
Args:
tree: the tree being searched
pos: the position of the root of the subtree being searched
path: the path taked to get to pos
pro: the pronoun being resolved (string)
check: whether or not there must be an intervening NP
Returns:
tree: the tree containing the antecedent
p: the position of the proposed antecedent
"""
# get the results of breadth first search of the subtree
# iterate over them
breadth_first = bft(tree[pos])
# convert the treepositions of the subtree rooted at pos
# to their equivalents in the whole tree
bf_pos = [get_pos(tree, node) for node in breadth_first]
if check == 1:
for p in bf_pos:
if p<path[0] and p not in path:
if "NP" in tree[p].label() and match(tree, p, pro):
if check_for_intervening_np(tree, pos, p, pro) == True:
return tree, p
elif check == 0:
for p in bf_pos:
if p<path[0] and p not in path:
if "NP" in tree[p].label() and match(tree, p, pro):
return tree, p
return None, None
def traverse_right(tree, pos, path, pro):
""" Traverse all the branches of pos to the right of path p in a
left-to-right, breadth-first manner, but do not go below any NP
or S node encountered. Propose any NP node encountered as the
antecedent. Returns the first potential antecedent.
Args:
tree: the tree being searched
pos: the position of the root of the subtree being searched
path: the path taken to get to pos
pro: the pronoun being resolved (string)
Returns:
tree: the tree containing the antecedent
p: the position of the antecedent
"""
breadth_first = bft(tree[pos])
bf_pos = [get_pos(tree, node) for node in breadth_first]
for p in bf_pos:
if p>path[0] and p not in path:
if "NP" in tree[p].label() or tree[p].label() == "S":
if "NP" in tree[p].label() and tree[p].label() not in nominal_labels:
if match(tree, p, pro):
return tree, p
return None, None
def traverse_tree(tree, pro):
""" Traverse a tree in a left-to-right, breadth-first manner,
proposing any NP encountered as an antecedent. Returns the
tree and the position of the first possible antecedent.
Args:
tree: the tree being searched
pro: the pronoun being resolved (string)
"""
# Initialize a queue and enqueue the root of the tree
queue = Queue.Queue()
queue.put(tree)
while not queue.empty():
node = queue.get()
# if the node is an NP, return it as a potential antecedent
if "NP" in node.label() and match(tree, get_pos(tree,node), pro):
return tree, get_pos(tree, node)
for child in node:
if isinstance(child, nltk.Tree):
queue.put(child)
# if no antecedent is found, return None
return None, None
def match(tree, pos, pro):
""" Takes a proposed antecedent and checks whether it matches
the pronoun in number and gender
Args:
tree: the tree in which a potential antecedent has been found
pos: the position of the potential antecedent
pro: the pronoun being resolved (string)
Returns:
True if the antecedent and pronoun match
False otherwise
"""
if number_match(tree, pos, pro) and gender_match(tree, pos, pro):
return True
return False
def number_match(tree, pos, pro):
""" Takes a proposed antecedent and pronoun and checks whether
they match in number.
"""
m = {"NN": "singular",
"NNP": "singular",
"he": "singular",
"she": "singular",
"him": "singular",
"her": "singular",
"it": "singular",
"himself": "singular",
"herself": "singular",
"itself": "singular",
"NNS": "plural",
"NNPS": "plural",
"they": "plural",
"them": "plural",
"themselves": "plural",
"PRP": None}
# if the label of the nominal dominated by the proposed NP and
# the pronoun both map to the same number feature, they match
for c in tree[pos]:
if isinstance(c, nltk.Tree) and c.label() in nominal_labels:
if m[c.label()] == m[pro]:
return True
return False
def gender_match(tree, pos, pro):
""" Takes a proposed antecedent and pronoun and checks whether
they match in gender. Only checks for mismatches between singular
proper name antecedents and singular pronouns.
"""
male_names = (name.lower() for name in names.words('male.txt'))
female_names = (name.lower() for name in names.words('female.txt'))
male_pronouns = ["he", "him", "himself"]
female_pronouns = ["she", "her", "herself"]
neuter_pronouns = ["it", "itself"]
for c in tree[pos]:
if isinstance(c, nltk.Tree) and c.label() in nominal_labels:
# If the proposed antecedent is a recognized male name,
# but the pronoun being resolved is either female or
# neuter, they don't match
if c.leaves()[0].lower() in male_names:
if pro in female_pronouns:
return False
elif pro in neuter_pronouns:
return False
# If the proposed antecedent is a recognized female name,
# but the pronoun being resolved is either male or
# neuter, they don't match
elif c.leaves()[0].lower() in female_names:
if pro in male_pronouns:
return False
elif pro in neuter_pronouns:
return False
# If the proposed antecedent is a numeral, but the
# pronoun being resolved is not neuter, they don't match
elif c.leaves()[0].isdigit():
if pro in male_pronouns:
return False
elif pro in female_pronouns:
return False
return True
def hobbs(sents, pos):
""" The implementation of Hobbs' algorithm.
Args:
sents: list of sentences to be searched
pos: the position of the pronoun to be resolved
Returns:
proposal: a tuple containing the tree and position of the
proposed antecedent
"""
# The index of the most recent sentence in sents
sentence_id = len(sents)-1
# The number of sentences to be searched
num_sents = len(sents)
# Step 1: begin at the NP node immediately dominating the pronoun
tree, pos = get_dom_np(sents, pos)
# String representation of the pronoun to be resolved
pro = tree[pos].leaves()[0].lower()
# Step 2: Go up the tree to the first NP or S node encountered
path, pos = walk_to_np_or_s(tree, pos)
# Step 3: Traverse all branches below pos to the left of path
# left-to-right, breadth-first. Propose as an antecedent any NP
# node that is encountered which has an NP or S node between it and pos
proposal = traverse_left(tree, pos, path, pro)
while proposal == (None, None):
# Step 4: If pos is the highest S node in the sentence,
# traverse the surface parses of previous sentences in order
# of recency, the most recent first; each tree is traversed in
# a left-to-right, breadth-first manner, and when an NP node is
# encountered, it is proposed as an antecedent
if pos == ():
# go to the previous sentence
sentence_id -= 1
# if there are no more sentences, no antecedent found
if sentence_id < 0:
return None
# search new sentence
proposal = traverse_tree(sents[sentence_id], pro)
if proposal != (None, None):
return proposal
# Step 5: If pos is not the highest S in the sentence, from pos,
# go up the tree to the first NP or S node encountered.
path, pos = walk_to_np_or_s(tree, pos)
# Step 6: If pos is an NP node and if the path to pos did not pass
# through the nominal node that pos immediately dominates, propose pos
# as the antecedent.
if "NP" in tree[pos].label() and tree[pos].label() not in nominal_labels:
for c in tree[pos]:
if isinstance(c, nltk.Tree) and c.label() in nominal_labels:
if get_pos(tree, c) not in path and match(tree, pos, pro):
proposal = (tree, pos)
if proposal != (None, None):
return proposal
# Step 7: Traverse all branches below pos to the left of path,
# in a left-to-right, breadth-first manner. Propose any NP node
# encountered as the antecedent.
proposal = traverse_left(tree, pos, path, pro, check=0)
if proposal != (None, None):
return proposal
# Step 8: If pos is an S node, traverse all the branches of pos
# to the right of path in a left-to-right, breadth-forst manner, but
# do not go below any NP or S node encountered. Propose any NP node
# encountered as the antecedent.
if tree[pos].label() == "S":
proposal = traverse_right(tree, pos, path, pro)
if proposal != (None, None):
return proposal
return proposal
def resolve_reflexive(sents, pos):
""" Resolves reflexive pronouns by going to the first S
node above the NP dominating the pronoun and searching for
a matching antecedent. If none is found in the lowest S
containing the anaphor, then the sentence probably isn't
grammatical or the reflexive is being used as an intensifier.
"""
tree, pos = get_dom_np(sents, pos)
pro = tree[pos].leaves()[0].lower()
# local binding domain of a reflexive is the lowest clause
# containing the reflexive and a binding NP
path, pos = walk_to_s(tree, pos)
proposal = traverse_tree(tree, pro)
return proposal
def walk_to_s(tree, pos):
""" Takes the tree being searched and the position from which
the walk up is started. Returns the position of the first S
encountered and the path taken to get there from the
dominating NP. The path consists of a list of tree positions.
Args:
tree: the tree being searched
pos: the position from which the walk is started
Returns:
path: the path taken to get the an S node
pos: the position of the first S node encountered
"""
path = [pos]
still_looking = True
while still_looking:
# climb one level up the tree by removing the last element
# from the current tree position
pos = pos[:-1]
path.append(pos)
# if an S node is encountered, return the path and pos
if tree[pos].label() == "S":
still_looking = False
return path, pos
def demo():
tree1 = Tree.fromstring('(S (NP (NNP John) ) (VP (VBD said) (SBAR (-NONE- 0) \
(S (NP (PRP he) ) (VP (VBD likes) (NP (NNS dogs) ) ) ) ) ) )')
tree2 = Tree.fromstring('(S (NP (NNP John) ) (VP (VBD said) (SBAR (-NONE- 0) \
(S (NP (NNP Mary) ) (VP (VBD likes) (NP (PRP him) ) ) ) ) ) )')
tree3 = Tree.fromstring('(S (NP (NNP John)) (VP (VBD saw) (NP (DT a) \
(JJ flashy) (NN hat)) (PP (IN at) (NP (DT the) (NN store)))))')
tree4 = Tree.fromstring('(S (NP (PRP He)) (VP (VBD showed) (NP (PRP it)) \
(PP (IN to) (NP (NNP Terrence)))))')
tree5 = Tree.fromstring("(S(NP-SBJ (NNP Judge) (NNP Curry))\
(VP(VP(VBD ordered)(NP-1 (DT the) (NNS refunds))\
(S(NP-SBJ (-NONE- *-1))(VP (TO to) (VP (VB begin)\
(NP-TMP (NNP Feb.) (CD 1))))))(CC and)\
(VP(VBD said)(SBAR(IN that)(S(NP-SBJ (PRP he))(VP(MD would)\
(RB n't)(VP(VB entertain)(NP(NP (DT any) (NNS appeals))(CC or)\
(NP(NP(JJ other)(NNS attempts)(S(NP-SBJ (-NONE- *))(VP(TO to)\
(VP (VB block) (NP (PRP$ his) (NN order))))))(PP (IN by)\
(NP (NNP Commonwealth) (NNP Edison)))))))))))(. .))")
tree6 = Tree.fromstring('(S (NP (NNP John) ) (VP (VBD said) (SBAR (-NONE- 0) \
(S (NP (NNP Mary) ) (VP (VBD likes) (NP (PRP herself) ) ) ) ) ) )')
print "Sentence 1:"
print tree1
tree, pos = hobbs([tree1], (1,1,1,0,0))
print "Proposed antecedent for 'he':", tree[pos], '\n'
print "Sentence 2:"
print tree2
tree, pos = hobbs([tree2], (1,1,1,1,1,0))
print "Proposed antecedent for 'him':", tree[pos], '\n'
print "Sentence 3:"
print tree3
print "Sentence 4:"
print tree4
tree, pos = hobbs([tree3,tree4], (1,1,0))
print "Proposed antecedent for 'it':", tree[pos]
tree, pos = hobbs([tree3,tree4], (0,0))
print "Proposed antecedent for 'he':", tree[pos], '\n'
print "Sentence 5:"
print tree5
tree, pos = hobbs([tree5], (1,2,1,1,0,0))
print "Proposed antecedent for 'he':", tree[pos], '\n'
print "Sentence 6:"
print tree6
tree, pos = resolve_reflexive([tree6], (1,1,1,1,1,0))
print "Proposed antecedent for 'herself':", tree[pos], '\n'
def main(argv):
if len(sys.argv) == 2 and argv[1] == "demo":
demo()
else:
if len(sys.argv) > 3 or len(sys.argv) < 2:
print "Enter the file and the pronoun to resolve."
elif len(sys.argv) == 3:
p = ["He", "he", "Him", "him", "She", "she", "Her",
"her", "It", "it", "They", "they"]
r = ["Himself", "himself", "Herself", "herself",
"Itself", "itself", "Themselves", "themselves"]
fname = sys.argv[1]
pro = sys.argv[2]
with open(fname) as f:
sents = f.readlines()
trees = [Tree.fromstring(s) for s in sents]
pos = get_pos(trees[-1], pro)
pos = pos[:-1]
if pro in p:
tree, pos = hobbs(trees, pos)
for t in trees:
print t, '\n'
print "Proposed antecedent for '"+pro+"':", tree[pos]
elif pro in r:
tree, pos = resolve_reflexive(trees, pos)
for t in trees:
print t, '\n'
print "Proposed antecedent for '"+pro+"':", tree[pos]
if __name__ == "__main__":
main(sys.argv)