# data_iterator.py (forked from sordonia/hed-dlg)

import numpy
import theano
import theano.tensor as T
import sys, getopt
import logging

from state import *
from utils import *
from SS_dataset import *

import itertools
import pickle
import random
import datetime

logger = logging.getLogger(__name__)

def create_padded_batch(state, x):
    mx = state['seqlen']
    n = state['bs']

    X = numpy.zeros((mx, n), dtype='int32')
    Xmask = numpy.zeros((mx, n), dtype='float32')

    # Variable to store each utterance in reverse form (for bidirectional RNNs)
    X_reversed = numpy.zeros((mx, n), dtype='int32')

    # Variables to store the last utterance (for computing the mutual information metric)
    X_last_utterance = numpy.zeros((mx, n), dtype='int32')
    X_last_utterance_reversed = numpy.zeros((mx, n), dtype='int32')
    Xmask_last_utterance = numpy.zeros((mx, n), dtype='float32')
    X_start_of_last_utterance = numpy.zeros((n), dtype='int32')

    # Fill X and Xmask.
    # Keep track of the number of predictions and the maximum triple length.
    num_preds = 0
    num_preds_last_utterance = 0
    max_length = 0

    for idx in xrange(len(x[0])):
        # Insert sequence idx in a column of matrix X
        triple_length = len(x[0][idx])

        # Skip triples that exceed the maximum sequence length
        if mx < triple_length:
            continue

        X[:triple_length, idx] = x[0][idx][:triple_length]
        max_length = max(max_length, triple_length)

        # Set the number of predictions == sum(Xmask), for cost purposes
        num_preds += triple_length

        # Mark the end of phrase
        if len(x[0][idx]) < mx:
            X[triple_length:, idx] = state['eos_sym']

        # Initialize the Xmask column with ones in all positions that
        # were just set in X.
        # Note: if we need the mask to depend on tokens inside X, then we must
        # create a corresponding mask for X_reversed and pass it through the model.
        Xmask[:triple_length, idx] = 1.

        # Reverse all utterances: the tokens of each utterance are reversed in
        # place, while the <s> and </s> delimiters keep their positions.
        sos_indices = numpy.where(X[:, idx] == state['sos_sym'])[0]
        eos_indices = numpy.where(X[:, idx] == state['eos_sym'])[0]
        X_reversed[:triple_length, idx] = x[0][idx][:triple_length]
        prev_eos_index = -1
        for eos_index in eos_indices:
            X_reversed[(prev_eos_index+2):eos_index, idx] = (X_reversed[(prev_eos_index+2):eos_index, idx])[::-1]
            prev_eos_index = eos_index
            if prev_eos_index > triple_length:
                break

        # Find the start of the last utterance and store that utterance
        assert (len(eos_indices) >= len(sos_indices))
        if len(sos_indices) > 0:  # Check that the dialogue is not empty
            start_of_last_utterance = sos_indices[-1]
        else:  # If it is empty, define the last utterance to start at the beginning
            start_of_last_utterance = 0

        num_preds_last_utterance += triple_length - start_of_last_utterance
        X_start_of_last_utterance[idx] = start_of_last_utterance
        X_last_utterance[0:(triple_length-start_of_last_utterance), idx] = X[start_of_last_utterance:triple_length, idx]
        Xmask_last_utterance[0:(triple_length-start_of_last_utterance), idx] = Xmask[start_of_last_utterance:triple_length, idx]

        # Also store the last utterance in reverse
        X_last_utterance_reversed[0:(triple_length-start_of_last_utterance), idx] = numpy.copy(X_last_utterance[0:(triple_length-start_of_last_utterance), idx])
        X_last_utterance_reversed[1:(triple_length-start_of_last_utterance-1), idx] = (X_last_utterance_reversed[1:(triple_length-start_of_last_utterance-1), idx])[::-1]

    assert num_preds == numpy.sum(Xmask)

    return {'x': X,
            'x_reversed': X_reversed,
            'x_mask': Xmask,
            'x_last_utterance': X_last_utterance,
            'x_last_utterance_reversed': X_last_utterance_reversed,
            'x_mask_last_utterance': Xmask_last_utterance,
            'x_start_of_last_utterance': X_start_of_last_utterance,
            'num_preds': num_preds,
            'num_preds_at_utterance': num_preds_last_utterance,
            'num_triples': len(x[0]),
            'max_length': max_length
            }
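
# A worked toy example of create_padded_batch (hypothetical state values, for
# illustration only): with state = {'seqlen': 5, 'bs': 2, 'eos_sym': 1,
# 'sos_sym': 0} and a single triple x = [[[0, 3, 4, 1]]], the returned batch
# satisfies
#     batch['x'][:, 0]          == [0, 3, 4, 1, 1]    (padded with eos_sym)
#     batch['x_mask'][:, 0]     == [1, 1, 1, 1, 0]    (num_preds == 4)
#     batch['x_reversed'][:, 0] == [0, 4, 3, 1, 0]    (words reversed in place)
# Column 1 stays all zeros with an all-zero mask, since only one triple was given.
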
class Iterator(SSIterator):
    def __init__(self, triple_file, batch_size, **kwargs):
        SSIterator.__init__(self, triple_file, batch_size,
                            semantic_file=kwargs.pop('semantic_file', None),
                            max_len=kwargs.pop('max_len', -1),
                            use_infinite_loop=kwargs.pop('use_infinite_loop', False))
        # TODO: max_len should be handled here, and SSIterator should zip
        # semantic_data and data.
        self.k_batches = kwargs.pop('sort_k_batches', 20)
        # TODO: kept for backward compatibility; this should be removed in future
        # versions, i.e. remove all the x_reversed computations in the model itself.
        self.state = kwargs.pop('state', None)
        # ----------------
        self.batch_iter = None

    def get_homogenous_batch_iter(self, batch_size=-1):
        while True:
            batch_size = self.batch_size if (batch_size == -1) else batch_size

            # Read k batches worth of triples, so that triples of similar
            # length can be grouped into the same padded batch.
            data = []
            for k in range(self.k_batches):
                batch = SSIterator.next(self)
                if batch:
                    data.append(batch)

            if not len(data):
                return

            number_of_batches = len(data)
            data = list(itertools.chain.from_iterable(data))

            # Split the list of words from the triple index
            data_x = []
            data_semantic = []
            for i in range(len(data)):
                data_x.append(data[i][0])
                data_semantic.append(data[i][1])

            x = numpy.asarray(list(itertools.chain(data_x)))
            x_semantic = numpy.asarray(list(itertools.chain(data_semantic)))

            # Sort the triples by length, then cut the sorted order into batches
            lens = numpy.asarray([map(len, x)])
            order = numpy.argsort(lens.max(axis=0))

            for k in range(number_of_batches):
                indices = order[k * batch_size:(k + 1) * batch_size]
                batch = create_padded_batch(self.state, [x[indices]])

                # Add semantic information to the batch; take care to fill with
                # -1 (= n/a) wherever the batch is padded with empty triples.
                if 'semantic_information_dim' in self.state:
                    batch['x_semantic'] = -numpy.ones((self.state['bs'], self.state['semantic_information_dim'])).astype('int32')
                    batch['x_semantic'][0:len(indices), :] = numpy.asarray(list(itertools.chain(x_semantic[indices]))).astype('int32')
                else:
                    batch['x_semantic'] = None

                if batch:
                    yield batch

    def start(self):
        SSIterator.start(self)
        self.batch_iter = None

    def next(self, batch_size=-1):
        """
        We can specify a batch size, independent of the object initialization.
        """
        if not self.batch_iter:
            self.batch_iter = self.get_homogenous_batch_iter(batch_size)
        try:
            batch = next(self.batch_iter)
        except StopIteration:
            return None
        return batch
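
# A minimal usage sketch for Iterator (the file name is hypothetical; the state
# dict normally comes from state.py and must provide at least 'seqlen', 'bs',
# 'eos_sym' and 'sos_sym'):
#
#     it = Iterator('train.triples.pkl', 80, state=state,
#                   use_infinite_loop=False, max_len=state['seqlen'])
#     it.start()
#     batch = it.next()  # dict with 'x', 'x_mask', ... or None when exhausted
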
def get_train_iterator(state):
    semantic_train_path = None
    semantic_valid_path = None

    if 'train_semantic' in state:
        assert state['valid_semantic']
        semantic_train_path = state['train_semantic']
        semantic_valid_path = state['valid_semantic']

    train_data = Iterator(
        state['train_triples'],
        int(state['bs']),
        state=state,
        seed=state['seed'],
        semantic_file=semantic_train_path,
        use_infinite_loop=True,
        max_len=state['seqlen'])

    valid_data = Iterator(
        state['valid_triples'],
        int(state['bs']),
        state=state,
        seed=state['seed'],
        semantic_file=semantic_valid_path,
        use_infinite_loop=False,
        max_len=state['seqlen'])
    return train_data, valid_data
def get_test_iterator(state):
    assert 'test_triples' in state
    test_path = state.get('test_triples')
    semantic_test_path = state.get('test_semantic', None)

    test_data = Iterator(
        test_path,
        int(state['bs']),
        state=state,
        seed=state['seed'],
        semantic_file=semantic_test_path,
        use_infinite_loop=False,
        max_len=state['seqlen'])
    return test_data
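
# Usage sketch (assumes a state dict such as the prototypes in state.py; key
# names follow the code above). Note that the training iterator is constructed
# with use_infinite_loop=True and recycles the data, while the validation and
# test iterators stop when exhausted:
#
#     train_data, valid_data = get_train_iterator(state)
#     train_data.start()
#     batch = train_data.next()      # never None: the training set is recycled
#
#     valid_data.start()
#     while True:
#         batch = valid_data.next()
#         if batch is None:          # validation set exhausted
#             break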