dag_to_jsonl.py
# 1. read plain and somehow merge with disamb
import glob
import json
import os
import sys
from argparse import ArgumentParser
import jsonlines
from tqdm import tqdm
from ktagger import KInterpretation, KToken, KText
"""
Reads DAGs from Morfeusz PolEval output (disambiguated or not).
"""
TOKENS = 'tokens'
YEARS = 'years'
SEGMENT = 'segment'
LEMMA = 'lemma'
SPACE_BEFORE = 'space_before'
TAG = 'tag'
START_POSITION = 'start_position'
END_POSITION = 'end_position'
DISAMB = 'disamb'
INTERPRETATIONS = 'interpretations'
START_OFFSET = 'start_offset'
END_OFFSET = 'end_offset'
def read_dag(path):
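    """Parse one DAG file into a list of paragraph dicts.

    Each paragraph is {TOKENS: [...], YEARS: <year range>}; each token dict holds
    START_POSITION, END_POSITION, SEGMENT, SPACE_BEFORE and a list of
    INTERPRETATIONS ({LEMMA, TAG, DISAMB}). Edges sharing the same start and end
    positions are merged into one token with multiple interpretations.
    """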
    paragraphs = []
    paragraph = {TOKENS: []}
    years = None
    # token_end_positions={0:0}
    for line in open(path):
        line = line[:-1]
        if line == '':  # end of paragraph
            if paragraph[TOKENS]:
                paragraphs.append(paragraph)
            paragraph = {TOKENS: [], YEARS: years}
            continue
        fields = line.split('\t')
        if len(fields) == 1:
            years = fields[0][1:]
            paragraph[YEARS] = years
            continue
        else:
            try:
                start_position, end_position, segment, lemma, tag, nps, disamb = fields
                # disamb = disamb == 'disamb'
                assert disamb in ['', 'disamb', 'disamb_manual']
            except ValueError:
                start_position, end_position, segment, lemma, tag, nps = fields
                disamb = ''
            start_position = int(start_position)
            end_position = int(end_position)
            nps = nps == 'nps'
            space_before = not nps
            last_token = paragraph[TOKENS][-1] if paragraph[TOKENS] else None
            if last_token is not None and \
                    last_token[START_POSITION] == start_position and \
                    last_token[END_POSITION] == end_position:
                assert last_token[SEGMENT] == segment
                last_token[INTERPRETATIONS].append({LEMMA: lemma,
                                                    TAG: tag,
                                                    DISAMB: disamb})
            else:
                token = {START_POSITION: start_position,
                         END_POSITION: end_position,
                         SEGMENT: segment,
                         SPACE_BEFORE: space_before,
                         INTERPRETATIONS: [{LEMMA: lemma,
                                            TAG: tag,
                                            DISAMB: disamb}]}
                paragraph[TOKENS].append(token)
    return paragraphs

def is_disamb(token):
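    """Return True if any interpretation of the token is marked as disamb or disamb_manual."""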
    return any('disamb' in interpretation[DISAMB] for interpretation in token[INTERPRETATIONS])

def dag_offsets(paragraph):
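    """Annotate each token with START_OFFSET/END_OFFSET character offsets into the
    paragraph's original text. Tokens whose segment does not match the text at the
    expected offset, or whose node has no incoming edge, get None offsets and are
    reported on stderr."""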
    text = original_text(paragraph)
    start_offsets = {}
    end_offsets = {0: 0}
    for token in paragraph[TOKENS]:
        start_position = token[START_POSITION]
        end_position = token[END_POSITION]
        # print(start_position, end_position)
        try:
            previous_end_offset = end_offsets[start_position]
            if token[SPACE_BEFORE]:
                previous_end_offset += 1
            start_offsets[start_position] = previous_end_offset
            token[START_OFFSET] = previous_end_offset
            if text[previous_end_offset:previous_end_offset + len(token[SEGMENT])] == token[SEGMENT]:
                end_offsets[end_position] = previous_end_offset + len(token[SEGMENT])
                token[END_OFFSET] = previous_end_offset + len(token[SEGMENT])
            else:  # manually corrected tokenization introducing space before
                # previous_offset += 1
                # offsets[end_position] = previous_offset + len(token[SEGMENT])
                # NOTE: `path` here is the module-level variable set in the loop at the bottom of the script.
                print('OMITTING token with different space before', path,
                      text[previous_end_offset:previous_end_offset + len(token[SEGMENT])], token[SEGMENT],
                      file=sys.stderr)
                token[START_OFFSET] = None
                token[END_OFFSET] = None
        except KeyError:
            print('OMITTING node without incoming edges', path, token[SEGMENT], file=sys.stderr)
            token[START_OFFSET] = None
            token[END_OFFSET] = None
    # print(offsets.values())
    del end_offsets[0]
    # return start_offsets, end_offsets

def original_text(paragraph):
""" Bierze pod uwagę pierwszą interpretację z danego węzła. Problem gdy analizator dodaje spacje, któ©ych nie ma."""
strings = []
last_position = 0
for token in paragraph[TOKENS]:
if token[START_POSITION] == last_position:
if token[SPACE_BEFORE]:
strings.append(' ')
strings.append(token[SEGMENT])
last_position = token[END_POSITION]
return ''.join(strings)
def convert_to_ktagger(path):
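    """Yield one KText object per paragraph of the DAG file at `path`, with tokens,
    offsets and interpretations filled in; each KText is round-tripped through its
    JSON serialization as a sanity check."""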
    file_name = os.path.basename(path)
    paragraphs = read_dag(path)
    # print(path, len(paragraphs))
    for paragraph_index, paragraph in enumerate(paragraphs):
        if args.only_disamb:
            tokens = [token for token in paragraph[TOKENS] if is_disamb(token)]
            paragraph[TOKENS] = tokens

        paragraph_id = f"{corpus}▁{file_name}▁{paragraph_index}"
        ktext = KText(paragraph_id)
        years = paragraph[YEARS]
        year_feature = years[:2]
        ktext.year = year_feature

        text = original_text(paragraph)
        ktext.text = text
        dag_offsets(paragraph)

        for token in paragraph[TOKENS]:
            ktoken = KToken(token[SEGMENT], token[SPACE_BEFORE], token[START_OFFSET], token[END_OFFSET])
            ktext.add_token(ktoken)
            ktoken.start_position = token[START_POSITION]
            ktoken.end_position = token[END_POSITION]
            for interpretation in token[INTERPRETATIONS]:
                disamb = 'disamb' in interpretation[DISAMB]
                if args.only_disamb and not disamb:
                    continue
                manual = 'manual' in interpretation[DISAMB]
                kinterpretation = KInterpretation(interpretation[LEMMA], interpretation[TAG], disamb, manual)
                ktoken.add_interpretation(kinterpretation)

        assert text == ktext.infer_original_text()
        ktext.check_offsets()
        # print(ktext.save())

        payload = json.loads(ktext.save2())
        k = KText.load(payload)
        # print(k)
        # print(ktext.save())
        # print(k.save())
        assert ktext.save2() == k.save2()
        # print(payload)
        assert payload == ktext.save()

        yield ktext
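
# Command-line entry point: convert every DAG file matched by the glob pattern into
# a single JSONL file. Example invocation (file and corpus names are illustrative):
#   python dag_to_jsonl.py 'dags/*.dag' output.jsonl my_corpus --only_disamb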
parser = ArgumentParser(description='Convert Morfeusz DAG output to KTagger JSONL')
parser.add_argument('path', help='glob pattern matching DAG input files')
parser.add_argument('output_path', help='path to the JSONL output file')
parser.add_argument('corpus_name', help='corpus name')
parser.add_argument('--only_disamb', action='store_true',
                    help='save only disamb versions of tokens and interpretations')
args = parser.parse_args()
corpus = args.corpus_name
with jsonlines.open(args.output_path, mode='w') as writer:
    for path in tqdm(sorted(glob.glob(args.path))):
        for ktext in convert_to_ktagger(path):
            writer.write(ktext.save())