-
Notifications
You must be signed in to change notification settings - Fork 4
/
Datahelpers.py
165 lines (145 loc) · 8.44 KB
/
Datahelpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import numpy as np
import nltk
import itertools
class Datahelper(object):
def __init__(self, data_dir, limits):
self.data_dir = data_dir
self.train_data_path = data_dir + '/ned.train.es.txt'
self.dev_data_path = data_dir + '/ned.dev.es.txt'
self.test_data_path = data_dir + '/ned.test.es.txt'
self.train_feat_path = data_dir + '/ned.train.txt'
self.dev_feat_path = data_dir + '/ned.testa.txt'
self.test_feat_path = data_dir + '/ned.testb.txt'
self.limits = limits
word_data, pos_data, tag_data = self.get_all_data(self.train_data_path)
self.word2idx, self.idx2word = self.get_dict(word_data)
#self.word2idx, self.emb_matrix = self.emb()
self.pos2idx, self.idx2pos = self.get_dict(pos_data)
self.feat2idx, self.feat_num = self.get_feat_dict()
self.tag_1o_idx, self.tag_2o_idx, self.tag_3o_idx = self.get_mul_dict(tag_data)
self.idx_1o_tag = {self.tag_1o_idx[tag]: tag for tag in self.tag_1o_idx}
self.idx_2o_tag = {self.tag_2o_idx[tag]: tag for tag in self.tag_2o_idx}
self.idx_3o_tag = {self.tag_3o_idx[tag]: tag for tag in self.tag_3o_idx}
self.train_set = self.get_data(self.train_data_path, self.train_feat_path, self.word2idx, self.pos2idx,
self.tag_1o_idx, self.tag_3o_idx, self.feat2idx)
self.dev_set = self.get_data(self.dev_data_path, self.dev_feat_path, self.word2idx, self.pos2idx,
self.tag_1o_idx, self.tag_3o_idx,self.feat2idx)
self.test_set = self.get_data(self.test_data_path, self.test_feat_path, self.word2idx, self.pos2idx,
self.tag_1o_idx, self.tag_3o_idx, self.feat2idx)
def get_all_data(self, train_data_path):
data = open(train_data_path, 'r').read().strip().split('\n\n')
word_data = [[word.split(' ')[0].lower() for word in sentence.split('\n')] for sentence in data]
pos_data = [[word.split(' ')[1] for word in sentence.split('\n')] for sentence in data]
tag_data = [[word.split(' ')[2] for word in sentence.split('\n')] for sentence in data]
return word_data, pos_data, tag_data
def get_mul_dict(self, data):
idx_1o = 1
tag_1o_idx = {'PADDING': 0}
for sent_tag in data:
for w_tag in sent_tag:
if w_tag not in tag_1o_idx:
tag_1o_idx[w_tag] = idx_1o
idx_1o += 1
# data_1o = [[tag_1o_idx[tag] for tag in tags]for tags in data]
tag_1o_idx['UNKNOWN'] = len(tag_1o_idx)
#tag_1o_idx = {'I-LOC': 9, 'B-ORG': 2, 'UNKNOWN': 10, 'O': 1, 'PADDING': 0,
# 'B-PER': 4, 'I-PER': 5, 'I-MISC': 7, 'B-MISC': 3, 'I-ORG': 8, 'B-LOC': 6}
idx_2o = 1
idx_3o = 1
tag_2o_idx = {}
tag_3o_idx = {}
tag_2o_idx[(0,0)] = 0
tag_3o_idx[(0,0,0)] = 0
data_1o = [[tag_1o_idx[tag] for tag in tags] for tags in data]
data_2o = [[(([0] + tags)[i - 1], ([0] + tags)[i]) for i in range(1, len([0] + tags))] for tags in data_1o]
data_3o = [[(([0] + tags + [0])[i - 1], ([0] + tags + [0])[i], ([0] + tags + [0])[i + 1]) for i in
range(1, len([0] + tags + [0]) - 1)] for tags in data_1o]
for sent_2o_tag in data_2o:
for w_tag in sent_2o_tag:
if w_tag not in tag_2o_idx:
tag_2o_idx[w_tag] = idx_2o
idx_2o += 1
for sent_3o_tag in data_3o:
for w_tag in sent_3o_tag:
if w_tag not in tag_3o_idx:
tag_3o_idx[w_tag] = idx_3o
idx_3o += 1
#tag_1o_idx['UNKNOWN'] = len(tag_1o_idx)
tag_2o_idx['UNKNOWN'] = len(tag_2o_idx)
tag_3o_idx['UNKNOWN'] = len(tag_3o_idx)
return tag_1o_idx, tag_2o_idx, tag_3o_idx
def get_dict(self, data):
word_freq = nltk.FreqDist(itertools.chain(*data))
data = sorted(word_freq.items(), key=lambda x: (x[1], x[0]), reverse=True)[:5000]
key_set = set()
for pair in data:
key_set.add(pair[0])
word2idx = {w:i+1 for i,w in enumerate(key_set)}
word2idx['UNKNOWN'] = len(word2idx) + 1
word2idx['PADDING'] = 0
idx2word = {word2idx[w]:w for w in word2idx}
return word2idx, idx2word
def get_feat_dict(self):
feat2idx = {'PADDING': 0, 'UNKNOWN': 1}
idx = 2
with open(self.data_dir + "/ned.train.txt") as f:
data = [[[w for w in row.strip().split(' ')] for row in sent.strip().split('\n')] for sent in
f.read().strip().split('\n\n')]
feat_num = len(data[0][0]) - 2
for sent in data:
for row in sent:
for w in row[1:-1]:
if w not in feat2idx:
feat2idx[w] = idx
idx += 1
return feat2idx, feat_num
def get_data(self, path, feat_path, word_dict, pos_dict, tag_1o_dict, tag_3o_dict, feat_dict):
data = open(path, 'r').read().strip().split('\n\n')
feat_data = open(feat_path, 'r').read().strip().split('\n\n')
if self.limits > 0:
data = data[:self.limits]
word_data = [[word.split(' ')[0].lower() for word in sentence.split('\n')] for sentence in data]
pos_data = [[word.split(' ')[1] for word in sentence.split('\n')] for sentence in data]
tag_1o_data = [[word.split(' ')[2] for word in sentence.split('\n')] for sentence in data]
word_data = [[word_dict[w] if w in word_dict else word_dict['UNKNOWN'] for w in sentence] for sentence in word_data]
pos_data = [[pos_dict[w] if w in pos_dict else pos_dict['UNKNOWN'] for w in sentence] for sentence in pos_data]
tag_1o_data = [[tag_1o_dict[w] if w in tag_1o_dict else tag_1o_dict['UNKNOWN'] for w in sentence] for sentence
in tag_1o_data]
tags_3o = [[(([0] + tag + [0])[i - 1], ([0] + tag + [0])[i], ([0] + tag + [0])[i + 1]) for i in
range(1, len([0] + tag + [0]) - 1)] for tag in tag_1o_data]
tag_3o_data = [[tag_3o_dict[w] if w in tag_3o_dict else tag_3o_dict['UNKNOWN'] for w in sentence] for sentence
in tags_3o]
feat_data = [[[w for w in row.strip().split(' ')[1:-1]]for row in sentence.strip().split('\n')] for sentence in feat_data]
feat_data = [[[feat_dict[w] if w in feat_dict else feat_dict['UNKNOWN'] for w in row]for row in sentence]for sentence in feat_data]
print ("feat_size = ", len(feat_data))
return word_data, pos_data, tag_3o_data, feat_data
def batch_iter(self, data, batch_size, shuffle, feat_num):
word_data, pos_data, tag_data, feat_data = data
data_size = len(word_data)
print ("datasize = ", data_size)
num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
if shuffle:
shuffle_indices = np.random.permutation(np.arange(data_size))
word_data = np.array(word_data)[shuffle_indices]
pos_data = np.array(pos_data)[shuffle_indices]
tag_data = np.array(tag_data)[shuffle_indices]
feat_data = np.array(feat_data)[shuffle_indices]
for batch_num in range(num_batches_per_epoch):
start_index = batch_num * batch_size
end_index = min((batch_num + 1) * batch_size, data_size)
max_sent_len = max([len(sample) for sample in word_data[start_index: end_index]])
batch_data = {'x_word':[], 'x_pos':[], 'x_len':[],'x_feat':[], 'y_tag':[]}
for sent_word, sent_pos, sent_tag, sent_feat in zip(word_data[start_index:end_index], pos_data[start_index:end_index],
tag_data[start_index:end_index], feat_data[start_index:end_index]):
sent_len = len(sent_word)
sent_word = sent_word + [0] * (max_sent_len - sent_len)
sent_pos = sent_pos + [0] * (max_sent_len - sent_len)
sent_tag = sent_tag + [0] * (max_sent_len - sent_len)
feat_pad = [[0] * feat_num]
sent_feat = sent_feat + feat_pad * (max_sent_len - sent_len)
batch_data['x_word'].append(sent_word)
batch_data['x_pos'].append(sent_pos)
batch_data['x_len'].append(sent_len)
batch_data['y_tag'].append(sent_tag)
batch_data['x_feat'].append(sent_feat)
yield batch_data