# dataio.py
import collections
from collections import defaultdict
import pandas as pd
import numpy as np
import spacy
import csv

nlp = spacy.load('en')

SpecialVocab = collections.namedtuple('SpecialVocab',
                                      ['sos', 'eos', 'unknown', 'padding'])
special_vocab = SpecialVocab(sos='SEQUENCE_START', eos='SEQUENCE_END',
                             unknown='UNK', padding='-PAD-')
def words_to_indices(seq, vocab_dict):
    '''
    Map a list of words/tokens to their integer indices in vocab_dict.
    Out-of-vocabulary words fall back to the UNK index.

    @param seq: list of words/tokens
    @param vocab_dict: dict mapping word -> integer index
    '''
    word_indices = []
    for w in seq:
        if w in vocab_dict:
            v = vocab_dict.get(w)
            if v is None:
                print("Got None for %s" % w)
            word_indices.append(v)
        else:
            word_indices.append(vocab_dict.get(special_vocab.unknown))
            print("Couldn't find %s" % w)
    return word_indices
def build_vocab(sentsdf, min_freq=1):
    vocab_dict = defaultdict(int)
    for _, row in sentsdf.iterrows():
        for w in row.words:
            vocab_dict[w] += 1
    # drop all words with frequency below min_freq
    vocabdf = pd.DataFrame({'word': list(vocab_dict.keys()),
                            'freq': list(vocab_dict.values())})
    vocabdf = vocabdf[vocabdf.freq >= min_freq]
    # add special vocab, with padding at index 0
    vocab_ = [special_vocab.padding, special_vocab.unknown,
              special_vocab.eos, special_vocab.sos] + vocabdf.word.values.tolist()
    return vocab_
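
# Illustrative only: a minimal sketch of how build_vocab and words_to_indices
# fit together. The toy DataFrame and the enumerate-based vocab_dict below are
# assumptions for illustration, not code from the original pipeline.
#
#   toy_df = pd.DataFrame({'words': [['the', 'cat', 'sat'], ['the', 'dog']]})
#   vocab_ = build_vocab(toy_df, min_freq=1)
#   # ['-PAD-', 'UNK', 'SEQUENCE_END', 'SEQUENCE_START', 'the', 'cat', ...]
#   vocab_dict = {w: i for i, w in enumerate(vocab_)}   # word -> index, padding at 0
#   words_to_indices(['the', 'cat', 'unseen'], vocab_dict)  # 'unseen' maps to the UNK index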
def pad_seq(seq, max_len):
    seq_len = len(seq)
    if seq_len > max_len:
        seq = seq[:max_len]
        seq_len = max_len
    else:
        seq = seq + [special_vocab.padding] * (max_len - seq_len)
    return seq
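
# Illustrative only (assumed example, not from the original code): pad_seq
# truncates sequences longer than max_len and right-pads shorter ones.
#
#   pad_seq(['a', 'b'], 4)                  # ['a', 'b', '-PAD-', '-PAD-']
#   pad_seq(['a', 'b', 'c', 'd', 'e'], 4)   # ['a', 'b', 'c', 'd']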
def process_sequence(df, vocab_dict, max_len):
    '''
    1. Tokenize
    2. Pad
    3. Convert to word indices
    4. Convert to relative distances from entity 1 and entity 2 for each word.
    '''
    word_indices = []
    ent1_dists = []
    ent2_dists = []
    for _, row in df.iterrows():
        seq = row.sent
        words = row.words
        padded_words = [special_vocab.sos] + words + [special_vocab.eos]
        padded_words = pad_seq(padded_words, max_len)
        wi = words_to_indices(padded_words, vocab_dict)
        e1_end = int(row.ent_1_end)  # seq.index(row.ent_1)+len(row.ent_1)
        e2_end = int(row.ent_2_end)  # seq.index(row.ent_2)+len(row.ent_2)
        # insert marker tokens right after each entity so their positions
        # can be recovered after re-tokenization
        newseq = seq[:e1_end].strip() + " entity_1_end " + seq[e1_end:e2_end].strip() + \
                 " entity_2_end " + seq[e2_end:].strip()
        newseq_words = [tok.text for tok in nlp.tokenizer(newseq)]
        newseq_words = [special_vocab.sos] + newseq_words + [special_vocab.eos]
        # i1/i2: positions of the rightmost word of each entity; the -1/-2
        # compensate for the inserted marker tokens
        i1 = newseq_words.index('entity_1_end') - 1  # TODO: Use head of entity phrase instead of rightmost word
        i2 = newseq_words.index('entity_2_end') - 2
        ent1_dist = [i - i1 for i in range(len(padded_words))]
        ent2_dist = [i - i2 for i in range(len(padded_words))]
        word_indices.append(wi)
        ent1_dists.append(ent1_dist)
        ent2_dists.append(ent2_dist)
    word_indices, ent1_dists, ent2_dists = \
        np.asarray(word_indices), np.asarray(ent1_dists), np.asarray(ent2_dists)
    # offset the relative distances by max_len so typical values are non-negative
    ent1_dists += max_len
    ent2_dists += max_len
    return word_indices, ent1_dists, ent2_dists
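
# Illustrative only: process_sequence expects a DataFrame with 'sent', 'words',
# 'ent_1_end' and 'ent_2_end' columns (as produced by read_semeval2010_data
# below), plus a word->index dict and a fixed max_len. The call below is an
# assumed sketch of its use, not code from the original project.
#
#   word_indices, ent1_dists, ent2_dists = process_sequence(df, vocab_dict, max_len=100)
#   # each returned array has shape (len(df), 100); the distance arrays are
#   # shifted by max_len so they can index a position-embedding table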
def read_semeval2010_data(filename):
    data = {'rel': [], 'sent': [], 'ent_1': [], 'ent_2': [], 'words': [],
            'ent_1_start': [], 'ent_2_start': [], 'ent_1_end': [], 'ent_2_end': []}
    etags = ['<e1>', '</e1>', '<e2>', '</e2>']
    with open(filename, 'r') as rf:
        for line in rf:
            _, sent = line.split('\t')
            rel = next(rf).strip().upper()
            next(rf)  # comment line
            next(rf)  # blank line
            e1 = sent[sent.index('<e1>') + 4:sent.index('</e1>')]
            e2 = sent[sent.index('<e2>') + 4:sent.index('</e2>')]
            # entity offsets in the tag-free, quote-stripped sentence
            e1_start = sent.index('<e1>') - 1
            e2_start = sent.index('<e2>') - 1*4 - 1*5 - 1  # compensating for tags, and "
            e1_end = sent.index('</e1>') - 1*4 - 1
            e2_end = sent.index('</e2>') - 2*4 - 1*5 - 1
            for tag_ in etags:
                sent = sent.replace(tag_, "")
            sent = sent.strip().lower()[1:-1]  # drop the surrounding quotes
            words = [tok.text for tok in nlp.tokenizer(sent)]
            data['sent'].append(sent)
            data['ent_1'].append(e1)
            data['ent_2'].append(e2)
            data['rel'].append(rel)
            data['words'].append(words)
            data['ent_1_start'].append(e1_start)
            data['ent_1_end'].append(e1_end)
            data['ent_2_start'].append(e2_start)
            data['ent_2_end'].append(e2_end)
    df = pd.DataFrame.from_dict(data)
    return df
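
# The reader above assumes the standard SemEval-2010 Task 8 text layout, where
# each example spans four lines: a tab-separated index and quoted sentence with
# <e1>/<e2> tags, the relation label, a comment line, and a blank line. A
# made-up example (for illustration only, not an actual dataset line):
#
#   12<TAB>"The <e1>company</e1> fabricates plastic <e2>chairs</e2>."
#   Product-Producer(e2,e1)
#   Comment:
#   <blank line>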
def read_embeddings(embeddings_path, vocab_, init_scale=0.25,
                    dtype='float32', random_state=None):
    if random_state is None:
        random_state = np.random.RandomState(10)
    vocab_vec = pd.read_csv(embeddings_path, header=None, skiprows=[0],
                            sep=' ', index_col=0, quoting=csv.QUOTE_NONE)
    cols = ['col%d' % x for x in range(vocab_vec.shape[1])]
    vocab_vec.columns = cols
    # known_words = [w for w in vocab_ if w in vocab_vec.index]
    # known_mat = vocab_vec.loc[known_words, :]
    # known_mat.to_csv(embeddings_path + ".aclaug", sep=' ', index_label='word')
    print("Vocab Size: %d" % len(vocab_), flush=True)
    unknown_words = [w for w in vocab_ if w not in vocab_vec.index]
    print("adding %d unknown words..." % len(unknown_words), flush=True)
    emb_dim = vocab_vec.shape[1]
    # initialize vectors for out-of-embedding words uniformly at random
    rnd_mat = random_state.uniform(-init_scale, init_scale,
                                   size=(len(unknown_words), emb_dim))
    rnd_df = pd.DataFrame(rnd_mat, index=unknown_words, columns=cols)
    vocab_vec = pd.concat([vocab_vec, rnd_df], axis=0)
    embeddings_mat = vocab_vec.loc[vocab_, :]
    return embeddings_mat
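
# A minimal end-to-end sketch, assuming file paths and max_len chosen by the
# caller; the paths below are hypothetical placeholders, not part of this repo.
#
# if __name__ == '__main__':
#     traindf = read_semeval2010_data('TRAIN_FILE.TXT')
#     vocab_ = build_vocab(traindf, min_freq=1)
#     vocab_dict = {w: i for i, w in enumerate(vocab_)}
#     embeddings = read_embeddings('embeddings.vec', vocab_)
#     word_indices, ent1_dists, ent2_dists = process_sequence(traindf, vocab_dict, max_len=100)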