-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrwg2feat.py
151 lines (141 loc) · 5.79 KB
/
rwg2feat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import sys
import os
import xml.etree.ElementTree as ET
# When True, each token is described only by its lower-cased surface form;
# the per-token <Feature> children of the annotation are ignored.
word_only = True

# Maps every annotation type that is turned into a labelled span onto the
# coarse category used as the label key.  The medium (speech / thought /
# writing) is collapsed; only the rendering category is kept.
span_types = {
    # speech
    'direct_speech': 'direct',
    'indirect_speech': 'indirect',
    'free_indirect_speech': 'free_indirect',
    'reported_speech': 'reported',
    # thought
    'direct_thought': 'direct',
    'indirect_thought': 'indirect',
    'free_indirect_thought': 'free_indirect',
    'reported_thought': 'reported',
    # writing
    'direct_writing': 'direct',
    'indirect_writing': 'indirect',
    'free_indirect_writing': 'free_indirect',
    'reported_writing': 'reported',
}
def _annotations(root, set_name):
    """Yield each child of every ``AnnotationSet`` whose ``Name`` is *set_name*."""
    for annotation_set in root:
        if annotation_set.tag != 'AnnotationSet':
            continue
        if annotation_set.attrib.get('Name') != set_name:
            continue
        for annotation in annotation_set:
            yield annotation


def _token_range(char2tok, start, end):
    """Map the character range ``[start, end)`` onto a token range.

    Characters not covered by a token (``-1`` in *char2tok*) are skipped
    forwards from *start* and backwards from ``end - 1``.  Returns
    ``(start_tok, end_tok)`` with ``end_tok`` exclusive, or ``None`` when no
    token can be found without leaving *char2tok*.
    """
    size = len(char2tok)
    while start < size and char2tok[start] == -1:
        start += 1
    if start >= size:
        return None
    # Clamp so that an annotation reaching past the last token still resolves
    # to that last token instead of indexing out of range.
    last = min(end, size) - 1
    while last >= 0 and char2tok[last] == -1:
        last -= 1
    if last < 0:
        return None
    return char2tok[start], char2tok[last] + 1


def document_features_and_labels(path, label_scheme='BE', split_sentences=True):
    """Extract token features and span-boundary labels from one XML document.

    The document is expected to carry its text (interleaved with ``Node``
    elements) in the second child of the root, token and sentence
    annotations in the ``PreProc_Anno`` annotation set, and the spans to be
    labelled in the ``RW_Anno`` set (presumably a GATE-style XML export --
    confirm against the corpus).  ``StartNode`` / ``EndNode`` values are used
    as character offsets into the text.

    Args:
        path: Path of the XML file.
        label_scheme: Only ``'BE'`` produces labels.  For every token a dict
            keyed by the values of ``span_types`` is created, initialised to
            ``' '``; the first token of a span gets ``'B'`` and the token at
            the span's exclusive end index (i.e. the first token after the
            span, if there is one) gets ``'E'``.  Any other value yields an
            empty label list.
        split_sentences: If true, return one token list / label list per
            ``Sentence`` annotation; otherwise one pair for the whole
            document.

    Returns:
        ``(tokens, labels)``: two parallel lists with one entry per sentence
        (or a single entry for the whole document).  A token is a list of
        ``(feature_name, value)`` tuples whose first element is
        ``('word', <lower-cased text>)``.

    Annotations that do not overlap any token are skipped.
    """
    # Parse from the file itself so the encoding declared in the XML header
    # is honoured instead of the locale's default encoding.
    root = ET.parse(path).getroot()

    # Rebuild the plain text.  Text preceding the first Node lives in
    # ``.text`` of the container and must be included, otherwise every
    # offset after it would be shifted.
    text_holder = root[1]
    text = ''.join(
        [text_holder.text or ''] + [node.tail or '' for node in text_holder]
    )

    tokens = []
    char2tok = []  # character offset -> token index, -1 where no token
    sentence_ranges = []
    for annotation in _annotations(root, 'PreProc_Anno'):
        label = annotation.attrib['Type']
        start = int(annotation.attrib['StartNode'])
        end = int(annotation.attrib['EndNode'])
        if label == 'Sentence':
            # Resolved to token indices later, once tokens are re-ordered.
            sentence_ranges.append((start, end))
        elif label == 'Token':
            if len(char2tok) < end:
                char2tok.extend([-1] * (end - len(char2tok)))
            for i in range(start, end):
                char2tok[i] = len(tokens)
            token_feats = [('word', text[start:end].lower())]
            if not word_only:
                for feature in annotation:
                    if feature.tag != 'Feature':
                        continue
                    feature_name = feature[0].text
                    feature_value = feature[1].text
                    if feature_name == 'featsRF' and feature_value is not None:
                        # Dot-separated value: one feature entry per part.
                        for part in feature_value.split('.'):
                            token_feats.append((feature_name, part))
                    else:
                        token_feats.append((feature_name, feature_value))
            tokens.append(token_feats)

    # Token annotations are not stored in text order; renumber them by
    # walking the character map from left to right.
    ordered_tokens = []
    ordered_char2tok = []
    previous = -1
    for tok in char2tok:
        if tok == -1:
            ordered_char2tok.append(-1)
        else:
            if tok != previous:
                ordered_tokens.append(tokens[tok])
            ordered_char2tok.append(len(ordered_tokens) - 1)
        previous = tok
    tokens = ordered_tokens
    char2tok = ordered_char2tok

    spans = []
    for annotation in _annotations(root, 'RW_Anno'):
        label = annotation.attrib['Type']
        if label not in span_types:
            continue
        token_range = _token_range(
            char2tok,
            int(annotation.attrib['StartNode']),
            int(annotation.attrib['EndNode']),
        )
        if token_range is None:
            continue
        spans.append((span_types[label],) + token_range)

    labels = []
    if label_scheme == 'BE':
        labels = [{st: ' ' for st in span_types.values()} for _ in tokens]
        for span_type, start_tok, end_tok in spans:
            labels[start_tok][span_type] = 'B'
            if end_tok < len(labels):
                labels[end_tok][span_type] = 'E'

    if not split_sentences:
        return [tokens], [labels]

    sentences_tokens = []
    sentences_labels = []
    for start, end in sentence_ranges:
        token_range = _token_range(char2tok, start, end)
        if token_range is None:
            continue
        start_tok, end_tok = token_range
        sentences_tokens.append(tokens[start_tok:end_tok])
        sentences_labels.append(labels[start_tok:end_tok])
    return sentences_tokens, sentences_labels
def corpus_feats_and_labels(path, label_scheme='BE'):
    """Collect features and labels from one XML file or a directory tree.

    Args:
        path: Either the path of a single ``.xml`` file, or a directory that
            is searched recursively for files whose name ends in ``.xml``.
        label_scheme: Forwarded to ``document_features_and_labels`` for every
            document.

    Returns:
        ``(instance_feats, instance_labels)``: the concatenation of what
        ``document_features_and_labels`` returns for each document.  Both
        lists are empty when *path* contains no ``.xml`` file.
    """
    instance_feats = []
    instance_labels = []

    def add_document(xml_path):
        doc_feats, doc_labels = document_features_and_labels(
            xml_path, label_scheme=label_scheme
        )
        instance_feats.extend(doc_feats)
        instance_labels.extend(doc_labels)

    # A directory that merely happens to be named "*.xml" is walked below
    # rather than being handed to the file parser.
    if path.endswith('.xml') and not os.path.isdir(path):
        add_document(path)

    for dirpath, dirnames, filenames in os.walk(path):
        # os.walk order is platform-dependent; sort in place so the instance
        # order is reproducible between runs and machines.
        dirnames.sort()
        for filename in sorted(filenames):
            if filename.endswith('.xml'):
                add_document(os.path.join(dirpath, filename))
    return instance_feats, instance_labels