-
Notifications
You must be signed in to change notification settings - Fork 1
/
TokenFilter.py
183 lines (171 loc) · 7.31 KB
/
TokenFilter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/env python3
# -*- coding: utf-8, vim: expandtab:ts=4 -*-
import re

from NLPInstanceFilter import *
from NLPInstance import *
from TokenProperty import *
from Token import *
class TokenFilter(NLPInstanceFilter):
    """Filter the tokens (and, as a consequence, the edges) of an NLP instance.

    The filter has two independent knobs:

    * *Forbidden properties* are stripped from every token.
    * *Allowed strings*: when at least one is set, only tokens that have a
      property value matching an allowed string are kept, and every edge
      touching a dropped token is dropped too. With no allowed strings all
      tokens are kept.
    """

    # An allowed string of the form "<from>-<to>" selects an inclusive range
    # of values of the "Index" property instead of being matched textually.
    _INDEX_RANGE = re.compile(r"\d+-\d+")

    def __init__(self):
        """Create a filter that forbids no property and allows every token."""
        # The set of properties we should not see.
        self._forbiddenProperties = set()
        # A token needs at least one property value equal to a string in this
        # set (if wholeWord is true) or containing one (otherwise).
        self._allowedStrings = set()
        # True: property values must equal an allowed string.
        # False: containing an allowed string is sufficient.
        self._wholeWord = False

    @property
    def wholeWord(self):
        """Whether tokens are matched by exact equality with an allowed string.

        Returns:
            True iff a property value must equal an allowed string; False if
            containing an allowed string is sufficient.
        """
        return self._wholeWord

    @wholeWord.setter
    def wholeWord(self, value):
        """Set whether allowed strings must match property values exactly.

        Args:
            value: True for exact matches, False for substring matches.
        """
        self._wholeWord = value

    def addAllowedString(self, string: str):
        """Add an allowed property value.

        Args:
            string: The allowed property value. On the "Index" property a
                value of the form "<from>-<to>" is treated as an inclusive
                index range.
        """
        self._allowedStrings.add(string)

    def clearAllowedStrings(self):
        """Remove all allowed strings; in this state all tokens are allowed."""
        self._allowedStrings.clear()

    def addForbiddenProperty(self, name: str):
        """Forbid a property so that its values are removed from each token.

        Args:
            name: The name of the property to forbid.
        """
        self._forbiddenProperties.add(TokenProperty(name))

    def removeForbiddenProperty(self, name: str):
        """Stop forbidding a property so that its values are shown again.

        Does nothing if the property is not currently forbidden.

        Args:
            name: The name of the property to show again.
        """
        self._forbiddenProperties.discard(TokenProperty(name))

    @property
    def forbiddenProperties(self):
        """The set of forbidden token properties.

        Returns:
            The filter's internal set itself (not a copy), so changes made to
            it are reflected in the filter.
        """
        return self._forbiddenProperties

    def filterTokens(self, original):
        """Copy tokens, leaving out the values of forbidden properties.

        Args:
            original: An iterable of the tokens to filter.

        Returns:
            A list with one new token per input token, carrying the same
            index and every property that is not forbidden.
        """
        result = []
        for vertex in original:
            copy = Token(vertex.index)
            for propType in vertex.getPropertyTypes():
                if propType not in self._forbiddenProperties:
                    copy.addProperty(property=propType,
                                     value=vertex.getProperty(propType))
            result.append(copy)
        return result

    def _valueMatches(self, propType, value, allowed):
        """Tell whether one property value satisfies one allowed string."""
        if propType.name == "Index" and self._INDEX_RANGE.fullmatch(allowed):
            low, high = (int(bound) for bound in allowed.split("-"))
            try:
                number = int(value)
            except (TypeError, ValueError):
                return False
            # Comparing str(number) with the value keeps the match as strict
            # as comparing against str(i) for each i in the range: "01" or
            # " 1" do not count as index 1.
            return low <= number <= high and str(number) == value
        if self._wholeWord:
            return value == allowed
        return allowed in value

    def _isAllowed(self, token):
        """Tell whether any property value of the token matches an allowed string."""
        for propType in token.getPropertyTypes():
            value = token.getProperty(propType)
            if any(self._valueMatches(propType, value, allowed)
                   for allowed in self._allowedStrings):
                return True
        return False

    def filter(self, original: NLPInstance):
        """Filter an NLP instance.

        Tokens are filtered first; edges that have a token which was filtered
        out are then removed, and split points are mapped onto the new token
        sequence.

        Args:
            original: The original NLP instance.

        Returns:
            The filtered NLP instance.
        """
        if not self._allowedStrings:
            # Nothing restricts the tokens: only strip forbidden properties.
            # Token indices are unchanged, so the split points are passed on
            # as they are.
            filteredTokens = self.filterTokens(original.tokens)
            return NLPInstance(tokens=filteredTokens, edges=original.getEdges(),
                               renderType=original.renderType,
                               splitPoints=original.splitPoints)

        # First filter out tokens not containing allowed strings. The kept
        # tokens are renumbered consecutively; keptOriginals[i] is the
        # original token behind tokens[i].
        old2new = {}
        tokens = []
        keptOriginals = []
        for t in original.tokens:
            if self._isAllowed(t):
                newVertex = Token(len(tokens))
                newVertex.merge(t)
                tokens.append(newVertex)
                keptOriginals.append(t)
                old2new[t] = newVertex

        # Update edges and remove those that have vertices not in the new
        # vertex set.
        edges = []
        for e in original.getEdges():
            if e.From not in old2new or e.To not in old2new:
                continue
            edges.append(Edge(From=old2new[e.From], To=old2new[e.To],
                              label=e.label, note=e.note, Type=e.type,
                              renderType=e.renderType,
                              description=e.description))

        # Find new split points (they have to change because the instance has
        # a new token sequence): each old split point maps to the first kept
        # token whose original index is not below it, or to the last kept
        # token if there is none. Without any kept token there is nothing to
        # split.
        splitPoints = []
        if tokens:
            newTokenIndex = 0
            for oldSplitPoint in original.splitPoints:
                while (newTokenIndex + 1 < len(tokens)
                       and keptOriginals[newTokenIndex].index < oldSplitPoint):
                    newTokenIndex += 1
                splitPoints.append(newTokenIndex)

        return NLPInstance(tokens=self.filterTokens(tokens), edges=edges,
                           renderType=original.renderType,
                           splitPoints=splitPoints)