
Commit

Merge branch 'julianser-iterator_refactor'
julianser committed May 21, 2015
2 parents f27c953 + 94cb8f2 commit 69a1fe5
Showing 4 changed files with 230 additions and 119 deletions.
41 changes: 17 additions & 24 deletions SS_dataset.py
@@ -1,4 +1,4 @@
import numpy as np
import numpy
import os, gc
import cPickle
import copy
@@ -10,21 +10,19 @@
import collections

logger = logging.getLogger(__name__)
np.random.seed(1234)

class SSFetcher(threading.Thread):
def __init__(self, parent):
threading.Thread.__init__(self)
self.parent = parent
self.indexes = np.arange(parent.data_len)
self.rng = numpy.random.RandomState(self.parent.seed)
self.indexes = numpy.arange(parent.data_len)

def run(self):
diter = self.parent
# Shuffle with the parent's random generator
self.parent.rng.shuffle(self.indexes)
self.rng.shuffle(self.indexes)

offset = 0
# Take groups of 10000 triples and group by length
offset = 0
while not diter.exit_flag:
last_batch = False
triples = []
@@ -37,16 +35,16 @@ def run(self):
else:
# Infinite loop here, we reshuffle the indexes
# and reset the offset
self.parent.rng.shuffle(self.indexes)
self.rng.shuffle(self.indexes)
offset = 0

index = self.indexes[offset]
s = diter.data[index]
offset += 1

# Append only if it is shorter than max_len
if len(s) <= diter.max_len:
if not diter.semantic_data == None:
if diter.max_len == -1 or len(s) <= diter.max_len:
if diter.semantic_file is not None:
triples.append([s, diter.semantic_data[index]])
else:
# Append 'None' to the triple if there is no semantic information
@@ -61,22 +59,20 @@ def run(self):

class SSIterator(object):
def __init__(self,
rng,
triple_file,
batch_size,
triple_file=None,
semantic_file=None,
dtype="int32",
can_fit=False,
queue_size=100,
cache_size=100,
shuffle=True,
seed=1234,
max_len=-1,
use_infinite_loop=True,
max_len=1000):
dtype="int32"):

self.triple_file = triple_file
self.batch_size = batch_size

args = locals()
args.pop("self")
self.__dict__.update(args)
self.rng = rng
self.load_files()
self.exit_flag = False

@@ -85,19 +81,16 @@ def load_files(self):
self.data_len = len(self.data)
logger.debug('Data len is %d' % self.data_len)

if not self.semantic_file == None:
if self.semantic_file:
self.semantic_data = cPickle.load(open(self.semantic_file, 'r'))
self.semantic_data_len = len(self.semantic_data)
logger.debug('Semantic data len is %d' % self.semantic_data_len)

# We need to have as many semantic labels as we have triples
assert self.semantic_data_len == self.data_len
else:
self.semantic_data = None

def start(self):
self.exit_flag = False
self.queue = Queue.Queue(maxsize=self.queue_size)
self.queue = Queue.Queue(maxsize=1000)
self.gather = SSFetcher(self)
self.gather.daemon = True
self.gather.start()
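The SS_dataset.py change above replaces the module-level np.random.seed(1234) with a per-fetcher numpy.random.RandomState built from the parent iterator's seed, so shuffling no longer touches numpy's global generator. A minimal sketch of that pattern follows; the Fetcher class and its method names are illustrative stand-ins, not part of this repository:

import numpy

class Fetcher(object):
    """Toy stand-in for SSFetcher: each instance owns its own RNG."""
    def __init__(self, seed, data_len):
        self.rng = numpy.random.RandomState(seed)  # private generator; global numpy state untouched
        self.indexes = numpy.arange(data_len)

    def shuffled_indexes(self):
        self.rng.shuffle(self.indexes)             # reproducible for a fixed seed
        return self.indexes.copy()

# Two fetchers built from the same seed shuffle identically,
# regardless of what numpy.random's global state currently is.
a = Fetcher(seed=1234, data_len=10).shuffled_indexes()
b = Fetcher(seed=1234, data_len=10).shuffled_indexes()
assert (a == b).all()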
207 changes: 115 additions & 92 deletions data_iterator.py
@@ -88,111 +88,134 @@ def create_padded_batch(state, x):
# Store also the last utterance in reverse
X_last_utterance_reversed[0:(triple_length-start_of_last_utterance), idx] = numpy.copy(X_last_utterance[0:(triple_length-start_of_last_utterance), idx])
X_last_utterance_reversed[1:(triple_length-start_of_last_utterance-1), idx] = (X_last_utterance_reversed[1:(triple_length-start_of_last_utterance-1), idx])[::-1]


assert num_preds == numpy.sum(Xmask)
return {'x': X, 'x_reversed': X_reversed, 'x_mask': Xmask, 'x_last_utterance': X_last_utterance, 'x_last_utterance_reversed': X_last_utterance_reversed, 'x_mask_last_utterance': Xmask_last_utterance, 'x_start_of_last_utterance': X_start_of_last_utterance, 'num_preds': num_preds, 'max_length': max_length}

def get_batch_iterator(rng, state):
class Iterator(SSIterator):
def __init__(self, *args, **kwargs):
SSIterator.__init__(self, rng, *args, **kwargs)
self.batch_iter = None

def get_homogenous_batch_iter(self, batch_size = -1):
while True:
k_batches = state['sort_k_batches']
batch_size = self.batch_size if (batch_size == -1) else batch_size

data = []
for k in range(k_batches):
batch = SSIterator.next(self)
if batch:
data.append(batch)

if not len(data):
return

number_of_batches = len(data)
data = list(itertools.chain.from_iterable(data))

# Split list of words from the triple index
data_x = []
data_semantic = []
for i in range(len(data)):
data_x.append(data[i][0])
data_semantic.append(data[i][1])

x = numpy.asarray(list(itertools.chain(data_x)))
x_semantic = numpy.asarray(list(itertools.chain(data_semantic)))

lens = numpy.asarray([map(len, x)])
order = numpy.argsort(lens.max(axis=0))
return {'x': X, \
'x_reversed': X_reversed, \
'x_mask': Xmask, \
'x_last_utterance': X_last_utterance, \
'x_last_utterance_reversed': X_last_utterance_reversed, \
'x_mask_last_utterance': Xmask_last_utterance, \
'x_start_of_last_utterance': X_start_of_last_utterance, \
'num_preds': num_preds, \
'max_length': max_length \
}

class Iterator(SSIterator):
def __init__(self, triple_file, batch_size, **kwargs):
SSIterator.__init__(self, triple_file, batch_size, \
semantic_file=kwargs.pop('semantic_file', None), \
max_len=kwargs.pop('max_len', -1), \
use_infinite_loop=kwargs.pop('use_infinite_loop', False))
# TODO: max_len should be handled here and SSIterator should zip semantic_data and
# data.
self.k_batches = kwargs.pop('sort_k_batches', 20)
# TODO: For backward compatibility. This should be removed in future versions
# i.e. remove all the x_reversed computations in the model itself.
self.state = kwargs.pop('state', None)
# ----------------
self.batch_iter = None

def get_homogenous_batch_iter(self, batch_size = -1):
while True:
batch_size = self.batch_size if (batch_size == -1) else batch_size

data = []
for k in range(self.k_batches):
batch = SSIterator.next(self)
if batch:
data.append(batch)

if not len(data):
return

number_of_batches = len(data)
data = list(itertools.chain.from_iterable(data))

# Split list of words from the triple index
data_x = []
data_semantic = []
for i in range(len(data)):
data_x.append(data[i][0])
data_semantic.append(data[i][1])

x = numpy.asarray(list(itertools.chain(data_x)))
x_semantic = numpy.asarray(list(itertools.chain(data_semantic)))

lens = numpy.asarray([map(len, x)])
order = numpy.argsort(lens.max(axis=0))

for k in range(number_of_batches):
indices = order[k * batch_size:(k + 1) * batch_size]
batch = create_padded_batch(state, [x[indices]])

# Add semantic information to batch; take care to fill with -1 (=n/a) whenever the batch is filled with empty triples
if 'semantic_information_dim' in state:
batch['x_semantic'] = - numpy.ones((state['bs'], state['semantic_information_dim'])).astype('int32')
batch['x_semantic'][0:len(indices), :] = numpy.asarray(list(itertools.chain(x_semantic[indices]))).astype('int32')
else:
batch['x_semantic'] = None

if batch:
yield batch
def start(self):
SSIterator.start(self)
self.batch_iter = None

def next(self, batch_size = -1):
"""
We can specify a batch size,
independent of the object initialization.
"""
if not self.batch_iter:
self.batch_iter = self.get_homogenous_batch_iter(batch_size)
try:
batch = next(self.batch_iter)
except StopIteration:
return None
return batch

semantic_train_file = None
semantic_valid_file = None
semantic_test_file = None

for k in range(number_of_batches):
indices = order[k * batch_size:(k + 1) * batch_size]
batch = create_padded_batch(self.state, [x[indices]])

# Add semantic information to batch; take care to fill with -1 (=n/a) whenever the batch is filled with empty triples
if 'semantic_information_dim' in self.state:
batch['x_semantic'] = - numpy.ones((self.state['bs'], self.state['semantic_information_dim'])).astype('int32')
batch['x_semantic'][0:len(indices), :] = numpy.asarray(list(itertools.chain(x_semantic[indices]))).astype('int32')
else:
batch['x_semantic'] = None

if batch:
yield batch

def start(self):
SSIterator.start(self)
self.batch_iter = None

def next(self, batch_size = -1):
"""
We can specify a batch size,
independent of the object initialization.
"""
if not self.batch_iter:
self.batch_iter = self.get_homogenous_batch_iter(batch_size)
try:
batch = next(self.batch_iter)
except StopIteration:
return None
return batch

def get_train_iterator(state):
semantic_train_path = None
semantic_valid_path = None
if 'train_semantic' in state:
assert state['valid_semantic']
assert state['test_semantic']
semantic_train_file = state['train_semantic']
semantic_valid_file = state['valid_semantic']
semantic_test_file = state['test_semantic']

semantic_train_path = state['train_semantic']
semantic_valid_path = state['valid_semantic']

train_data = Iterator(
batch_size=int(state['bs']),
triple_file=state['train_triples'],
semantic_file = semantic_train_file,
queue_size=100,
state['train_triples'],
int(state['bs']),
state=state,
seed=state['seed'],
semantic_file=semantic_train_path,
use_infinite_loop=True,
max_len=state['seqlen'])

valid_data = Iterator(
batch_size=int(state['bs']),
triple_file=state['valid_triples'],
semantic_file = semantic_valid_file,
state['valid_triples'],
int(state['bs']),
state=state,
seed=state['seed'],
semantic_file=semantic_valid_path,
use_infinite_loop=False,
queue_size=100,
max_len=state['seqlen'])

return train_data, valid_data

def get_test_iterator(state):
assert 'test_triples' in state
test_path = state.get('test_triples')
semantic_test_path = state.get('test_semantic', None)

test_data = Iterator(
batch_size=int(state['bs']),
triple_file=state['test_triples'],
semantic_file = semantic_test_file,
test_path,
int(state['bs']),
state=state,
seed=state['seed'],
semantic_file=semantic_test_path,
use_infinite_loop=False,
queue_size=100,
max_len=state['seqlen'])

return train_data, valid_data, test_data
return test_data
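With this refactor the old get_batch_iterator(rng, state) entry point is split into get_train_iterator(state) and get_test_iterator(state), both configured purely through the state dictionary. A hedged usage sketch follows; the paths and numbers are placeholders, and the real state object in this repository carries more keys than the ones referenced in this diff (train_triples, valid_triples, test_triples, bs, seed, seqlen):

from data_iterator import get_train_iterator, get_test_iterator

# Illustrative only: paths and values below are placeholders, not repository defaults.
state = {
    'train_triples': 'Data/train.triples.pkl',
    'valid_triples': 'Data/valid.triples.pkl',
    'test_triples':  'Data/test.triples.pkl',
    'bs': 80,        # batch size
    'seed': 1234,    # forwarded to the iterator's RNG
    'seqlen': 80,    # max_len passed to the iterators
}

train_data, valid_data = get_train_iterator(state)
test_data = get_test_iterator(state)

train_data.start()          # spawns the background SSFetcher thread
batch = train_data.next()   # padded batch dict: 'x', 'x_mask', 'num_preds', ...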