-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfix_numbering.py
36 lines (35 loc) · 1.37 KB
/
fix_numbering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import sys
import pyconll
# Renumber the token IDs of every sentence in a CoNLL-U file so that they run
# consecutively from 1, and rewrite HEAD references to match.
#
# Usage: fix_numbering.py INPUT.conllu OUTPUT.conllu
#
# For each sentence the script re-emits the `newdoc`, `comment`, `sent_id`,
# `speaker` and `text` comment lines (in that order, skipping the optional ones
# that are absent), then the renumbered tokens, then a blank separator line.
# The sentence ID (and the newdoc value, when present) is echoed to stdout as
# a progress trace.
corpus = pyconll.load_from_file(sys.argv[1])

# CoNLL-U is a UTF-8 format; pin the output encoding so that the result does
# not depend on the platform's locale default.
with open(sys.argv[2], 'w', encoding='utf-8') as fixed:
    for sentence in corpus:
        print(sentence.id)

        # --- sentence-level comment lines ---
        if sentence.meta_present('newdoc'):
            newdoc = sentence.meta_value('newdoc')
            print(newdoc)
            fixed.write(f'# newdoc = {newdoc}\n')
        if sentence.meta_present('comment'):
            fixed.write(f"# comment = {sentence.meta_value('comment')}\n")
        fixed.write(f'# sent_id = {sentence.id}\n')
        if sentence.meta_present('speaker'):
            fixed.write(f"# speaker = {sentence.meta_value('speaker')}\n")
        # Underscores in the stored text are written out as spaces.
        fixed.write(f"# text = {sentence.text.replace('_', ' ')}\n")

        # --- pass 1: assign consecutive IDs ---
        # `mapping` records old ID -> new ID only for tokens whose ID changed.
        mapping = {}
        i = 0  # number of ordinary (non-range) tokens seen so far
        for token in sentence:
            if '-' not in token.id:
                # Ordinary token.
                # NOTE: an ID containing a decimal point (CoNLL-U empty node,
                # e.g. "8.1") is not handled here; int() raises ValueError.
                i += 1
                if int(token.id) != i:
                    mapping[token.id] = str(i)
                    token.id = str(i)
            else:
                # Multiword range "a-b": it precedes the words it covers, so
                # it starts at the next ordinary ID and keeps its width b - a.
                bounds = token.id.split('-')
                width = int(bounds[1]) - int(bounds[0])
                token.id = f"{i + 1}-{i + 1 + width}"

        # --- pass 2: repoint heads and write the tokens ---
        # Heads are remapped only after every token has its final ID, because
        # a head may refer to a token that appears later in the sentence.
        for token in sentence:
            if token.head in mapping:
                token.head = mapping[token.head]
            fixed.write(f"{token.conll()}\n")

        # Blank line terminates the sentence.
        fixed.write('\n')