-
Notifications
You must be signed in to change notification settings - Fork 98
/
Copy pathtaggers.py
56 lines (46 loc) · 1.31 KB
/
taggers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from nltk.tag import NgramTagger, SequentialBackoffTagger
from nltk.corpus import wordnet, names
from nltk.probability import FreqDist
class QuadgramTagger(NgramTagger):
    '''
    An ``NgramTagger`` whose n-gram size is fixed at 4.

    Every positional and keyword argument given to the constructor is
    forwarded, unchanged, to ``NgramTagger.__init__`` after the size.
    '''

    def __init__(self, *args, **kwargs):
        super(QuadgramTagger, self).__init__(4, *args, **kwargs)
class WordNetTagger(SequentialBackoffTagger):
    '''
    Backoff tagger that picks a tag from the part of speech occurring most
    often among a word's WordNet synsets.

    >>> wt = WordNetTagger()
    >>> wt.tag(['food', 'is', 'great'])
    [('food', 'NN'), ('is', 'VB'), ('great', 'JJ')]
    '''

    def __init__(self, *args, **kwargs):
        super(WordNetTagger, self).__init__(*args, **kwargs)
        # WordNet part-of-speech letter -> tag returned by choose_tag.
        self.wordnet_tag_map = dict(n='NN', s='JJ', a='JJ', r='RB', v='VB')

    def choose_tag(self, tokens, index, history):
        '''
        Return the tag for ``tokens[index]``.

        The part of speech of every synset found for the word is counted
        and the most frequent one is looked up in ``wordnet_tag_map``.
        Returns None when the word has no synsets or when that part of
        speech has no entry in the map. ``history`` is not used.
        '''
        pos_counts = FreqDist(
            synset.pos() for synset in wordnet.synsets(tokens[index])
        )
        if pos_counts:
            return self.wordnet_tag_map.get(pos_counts.max())
        return None
class NamesTagger(SequentialBackoffTagger):
    '''
    Backoff tagger that assigns 'NNP' to any word that appears, ignoring
    case, in the words of the ``names`` corpus.

    >>> nt = NamesTagger()
    >>> nt.tag(['Jacob'])
    [('Jacob', 'NNP')]
    '''

    def __init__(self, *args, **kwargs):
        super(NamesTagger, self).__init__(*args, **kwargs)
        # Lower-cased once up front so choose_tag can match case-insensitively.
        self.name_set = {name.lower() for name in names.words()}

    def choose_tag(self, tokens, index, history):
        '''
        Return 'NNP' if the lower-cased ``tokens[index]`` is in
        ``name_set``; otherwise return None. ``history`` is not used.
        '''
        is_known_name = tokens[index].lower() in self.name_set
        return 'NNP' if is_known_name else None
if __name__ == '__main__':
    # When run as a script, execute the doctest examples found in this
    # module's docstrings.
    import doctest
    doctest.testmod()