-
Notifications
You must be signed in to change notification settings - Fork 6
/
tokens.py
151 lines (114 loc) · 3.56 KB
/
tokens.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python
"""
Source: https://github.com/bastings/parser.
Edited slightly.
"""
import os
from itertools import count
class Token:
    """Common base class of the token representations in this module."""
    pass
class XToken(Token):
    """One token line of a CoNLL-X file (ten columns)."""

    def __init__(self, tid, form, lemma, cpos, pos, feats,
                 head, deprel, phead, pdelrel):
        """
        Store the ten columns of a CoNLL-X token.

        ``tid`` and ``head`` are converted with ``int``; every other column
        is kept exactly as passed in. The last argument (``pdelrel``) is
        stored on the attribute ``pdeprel``.

        Raises:
          ValueError: if ``tid`` or ``head`` is a string that ``int`` cannot
            parse.
        """
        # Numeric columns first, in the same order as the column layout.
        token_index = int(tid)
        head_index = int(head)

        self.id = token_index
        self.form = form
        self.lemma = lemma
        self.cpos = cpos
        self.pos = pos
        self.feats = feats
        self.head = head_index
        self.deprel = deprel
        self.phead = phead
        self.pdeprel = pdelrel

    def __str__(self):
        """Return the token as a tab-separated CoNLL-X line (no newline)."""
        columns = (
            '%d' % (self.id,),
            self.form,
            self.lemma,
            self.cpos,
            self.pos,
            self.feats,
            '%d' % (self.head,),
            self.deprel,
            self.phead,
            self.pdeprel,
        )
        return '\t'.join('%s' % (column,) for column in columns)

    def __repr__(self):
        """Same text as ``__str__``."""
        return str(self)
class UToken(Token):
    """One token line of a CoNLL-U file (ten columns)."""

    def __init__(self, tid, form, lemma, upos, xpos, feats,
                 head, deprel, deps, misc):
        """
        Store the ten columns of a CoNLL-U token.

        Args:
          tid: word index, starting at 1. Kept verbatim in ``str_id``; the
            integer part is stored in ``id`` (a decimal id such as "10.1",
            used for empty nodes, becomes 10).
          form: word form or punctuation symbol.
          lemma: lemma or stem of word form.
          upos: universal part-of-speech tag.
          xpos: language specific part-of-speech tag.
          feats: morphological features.
          head: head of current word (an ID or 0); converted with ``int``.
          deprel: universal dependency relation to the HEAD
            (root iff HEAD = 0).
          deps: enhanced dependency graph in the form of a list of
            head-deprel pairs.
          misc: any other annotation.

        Raises:
          ValueError: if ``tid`` cannot be parsed by ``float`` (for example a
            multi-word range such as "1-2") or ``head`` cannot be parsed by
            ``int`` (for example "_").
        """
        # Convert the numeric columns up front: id first, then head.
        training_id = int(float(tid))
        head_index = int(head)

        self.str_id = tid        # original text, used when printing the line
        self.id = training_id    # integer index, used for training
        self.form = form
        self.lemma = lemma
        self.upos = upos
        self.xpos = xpos
        self.feats = feats
        self.head = head_index
        self.deprel = deprel
        self.deps = deps
        self.misc = misc

    def __str__(self):
        """Return the token as a tab-separated CoNLL-U line (no newline)."""
        columns = (
            self.str_id,
            self.form,
            self.lemma,
            self.upos,
            self.xpos,
            self.feats,
            self.head,
            self.deprel,
            self.deps,
            self.misc,
        )
        return '\t'.join('%s' % (column,) for column in columns)

    def __repr__(self):
        """Same text as ``__str__``."""
        return str(self)

    @property
    def pos(self):
        """Alias for ``upos``."""
        return self.upos
def get_conllx_line(tid=1, form='_', lemma='_', cpos='_', pos='_',
                    feats='_', head='_', deprel='_', phead='_', pdelrel='_'):
    """
    Build a tab-separated CoNLL-X line from the ten column values.

    ``tid`` is formatted as an integer (``%d``); the remaining columns are
    formatted with ``%s``. The returned string ends with a tab character
    and has no newline.
    """
    text_columns = (form, lemma, cpos, pos, feats,
                    head, deprel, phead, pdelrel)
    pieces = ['%d' % (tid,)]
    pieces.extend('%s' % (value,) for value in text_columns)
    # The empty last piece produces the trailing tab of the original format.
    pieces.append('')
    return '\t'.join(pieces)
def read_conllx(f):
    """
    Yield the sentences of a CoNLL-X stream, one list of XToken each.

    Args:
      f: iterable of text lines, e.g. an open file.

    Yields:
      A non-empty list of XToken for every sentence. Sentences are
      separated by blank lines; lines starting with "#" are skipped.
      Leading or repeated blank lines never produce an empty sentence.

    Raises:
      AssertionError: if a token line does not have exactly 10 columns.
    """
    tokens = []
    for line in f:
        line = line.strip()
        if not line:
            # A blank line closes the current sentence; only emit it when
            # tokens were actually collected.
            if tokens:
                yield tokens
                tokens = []
            continue
        if line[0] == "#":
            continue
        # Split on tabs first so a column that contains a space stays in
        # one piece; fall back to any-whitespace splitting for files that
        # are space-delimited.
        parts = line.split("\t")
        if len(parts) != 10:
            parts = line.split()
        if len(parts) != 10:
            # Raised explicitly instead of `assert` so the check is not
            # stripped when Python runs with -O.
            raise AssertionError("invalid conllx line")
        tokens.append(XToken(*parts))

    # possible last sentence without a blank line after it
    if tokens:
        yield tokens
def read_conllu(f):
    """
    Yield the sentences of a CoNLL-U stream, one list of UToken each.

    Args:
      f: iterable of text lines, e.g. an open file.

    Yields:
      A non-empty list of UToken for every sentence. Sentences are
      separated by blank lines; lines starting with "#" are skipped.
      Leading or repeated blank lines never produce an empty sentence.

    Raises:
      AssertionError: if a token line does not have exactly 10 columns.
    """
    tokens = []
    for line in f:
        line = line.strip()
        if not line:
            # A blank line closes the current sentence; only emit it when
            # tokens were actually collected.
            if tokens:
                yield tokens
                tokens = []
            continue
        if line[0] == "#":
            continue
        # CoNLL-U columns are tab-separated and some columns may contain
        # spaces, so split on tabs first; fall back to any-whitespace
        # splitting for files that are space-delimited.
        parts = line.split("\t")
        if len(parts) != 10:
            parts = line.split()
        if len(parts) != 10:
            # Raised explicitly instead of `assert` so the check is not
            # stripped when Python runs with -O.
            raise AssertionError("invalid conllu line")
        tokens.append(UToken(*parts))

    # possible last sentence without a blank line after it
    if tokens:
        yield tokens
def print_example(ex):
    """
    Print one example as a small table, followed by an empty line.

    Each row shows a 1-based position, the form and the POS tag. When the
    instance has a ``head`` attribute of its own, the row also shows the
    head index and dependency relation; otherwise "? ?" is printed in
    their place.
    """
    has_heads = "head" in ex.__dict__
    if has_heads:
        columns = zip(count(start=1), ex.form, ex.pos, ex.head, ex.deprel)
        rows = ["%2d %12s %5s -> %2d (%s)" % row for row in columns]
    else:
        columns = zip(count(start=1), ex.form, ex.pos)
        rows = ["%2d %12s %5s -> ? ?" % row for row in columns]
    # Table, newline, then one empty line.
    print("\n".join(rows), end="\n\n")
# Running this module directly does nothing; it only provides definitions.
if __name__ == '__main__':
    pass