
Commit

Merge branch 'julianser-iterator_refactor'
julianser committed May 21, 2015
2 parents f27c953 + 94cb8f2 commit 69a1fe5
Showing 4 changed files with 230 additions and 119 deletions.
41 changes: 17 additions & 24 deletions SS_dataset.py
@@ -1,4 +1,4 @@
import numpy as np
import numpy
import os, gc
import cPickle
import copy
@@ -10,21 +10,19 @@
import collections

logger = logging.getLogger(__name__)
np.random.seed(1234)

class SSFetcher(threading.Thread):
def __init__(self, parent):
threading.Thread.__init__(self)
self.parent = parent
self.indexes = np.arange(parent.data_len)
self.rng = numpy.random.RandomState(self.parent.seed)
self.indexes = numpy.arange(parent.data_len)

def run(self):
diter = self.parent
# Shuffle with the parent's random generator
self.parent.rng.shuffle(self.indexes)
self.rng.shuffle(self.indexes)

offset = 0
# Take groups of 10000 triples and group by length
offset = 0
while not diter.exit_flag:
last_batch = False
triples = []
@@ -37,16 +35,16 @@ def run(self):
else:
# Infinite loop here, we reshuffle the indexes
# and reset the offset
self.parent.rng.shuffle(self.indexes)
self.rng.shuffle(self.indexes)
offset = 0

index = self.indexes[offset]
s = diter.data[index]
offset += 1

# Append only if it is shorter than max_len
if len(s) <= diter.max_len:
if not diter.semantic_data == None:
if diter.max_len == -1 or len(s) <= diter.max_len:
if diter.semantic_file is not None:
triples.append([s, diter.semantic_data[index]])
else:
# Append 'None' to the triple if there is no semantic information
@@ -61,22 +59,20 @@ def run(self):

class SSIterator(object):
def __init__(self,
rng,
triple_file,
batch_size,
triple_file=None,
semantic_file=None,
dtype="int32",
can_fit=False,
queue_size=100,
cache_size=100,
shuffle=True,
seed=1234,
max_len=-1,
use_infinite_loop=True,
max_len=1000):
dtype="int32"):

self.triple_file = triple_file
self.batch_size = batch_size

args = locals()
args.pop("self")
self.__dict__.update(args)
self.rng = rng
self.load_files()
self.exit_flag = False

@@ -85,19 +81,16 @@ def load_files(self):
self.data_len = len(self.data)
logger.debug('Data len is %d' % self.data_len)

if not self.semantic_file == None:
if self.semantic_file:
self.semantic_data = cPickle.load(open(self.semantic_file, 'r'))
self.semantic_data_len = len(self.semantic_data)
logger.debug('Semantic data len is %d' % self.semantic_data_len)

# We need to have as many semantic labels as we have triples
assert self.semantic_data_len == self.data_len
else:
self.semantic_data = None

def start(self):
self.exit_flag = False
self.queue = Queue.Queue(maxsize=self.queue_size)
self.queue = Queue.Queue(maxsize=1000)
self.gather = SSFetcher(self)
self.gather.daemon = True
self.gather.start()
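The SS_dataset.py change above replaces the module-level np.random.seed(1234) with a per-fetcher numpy.random.RandomState built from the parent iterator's seed, so shuffling no longer touches numpy's global generator. A minimal sketch of that pattern follows; the Fetcher class and its method names are illustrative stand-ins, not part of this repository:

import numpy

class Fetcher(object):
    """Toy stand-in for SSFetcher: each instance owns its own RNG."""
    def __init__(self, seed, data_len):
        self.rng = numpy.random.RandomState(seed)  # private generator; global numpy state untouched
        self.indexes = numpy.arange(data_len)

    def shuffled_indexes(self):
        self.rng.shuffle(self.indexes)             # reproducible for a fixed seed
        return self.indexes.copy()

# Two fetchers built from the same seed shuffle identically,
# regardless of what numpy.random's global state currently is.
a = Fetcher(seed=1234, data_len=10).shuffled_indexes()
b = Fetcher(seed=1234, data_len=10).shuffled_indexes()
assert (a == b).all()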
207 changes: 115 additions & 92 deletions data_iterator.py
@@ -88,111 +88,134 @@ def create_padded_batch(state, x):
# Store also the last utterance in reverse
X_last_utterance_reversed[0:(triple_length-start_of_last_utterance), idx] = numpy.copy(X_last_utterance[0:(triple_length-start_of_last_utterance), idx])
X_last_utterance_reversed[1:(triple_length-start_of_last_utterance-1), idx] = (X_last_utterance_reversed[1:(triple_length-start_of_last_utterance-1), idx])[::-1]


assert num_preds == numpy.sum(Xmask)
return {'x': X, 'x_reversed': X_reversed, 'x_mask': Xmask, 'x_last_utterance': X_last_utterance, 'x_last_utterance_reversed': X_last_utterance_reversed, 'x_mask_last_utterance': Xmask_last_utterance, 'x_start_of_last_utterance': X_start_of_last_utterance, 'num_preds': num_preds, 'max_length': max_length}

def get_batch_iterator(rng, state):
class Iterator(SSIterator):
def __init__(self, *args, **kwargs):
SSIterator.__init__(self, rng, *args, **kwargs)
self.batch_iter = None

def get_homogenous_batch_iter(self, batch_size = -1):
while True:
k_batches = state['sort_k_batches']
batch_size = self.batch_size if (batch_size == -1) else batch_size

data = []
for k in range(k_batches):
batch = SSIterator.next(self)
if batch:
data.append(batch)

if not len(data):
return

number_of_batches = len(data)
data = list(itertools.chain.from_iterable(data))

# Split list of words from the triple index
data_x = []
data_semantic = []
for i in range(len(data)):
data_x.append(data[i][0])
data_semantic.append(data[i][1])

x = numpy.asarray(list(itertools.chain(data_x)))
x_semantic = numpy.asarray(list(itertools.chain(data_semantic)))

lens = numpy.asarray([map(len, x)])
order = numpy.argsort(lens.max(axis=0))
return {'x': X, \
'x_reversed': X_reversed, \
'x_mask': Xmask, \
'x_last_utterance': X_last_utterance, \
'x_last_utterance_reversed': X_last_utterance_reversed, \
'x_mask_last_utterance': Xmask_last_utterance, \
'x_start_of_last_utterance': X_start_of_last_utterance, \
'num_preds': num_preds, \
'max_length': max_length \
}

class Iterator(SSIterator):
def __init__(self, triple_file, batch_size, **kwargs):
SSIterator.__init__(self, triple_file, batch_size, \
semantic_file=kwargs.pop('semantic_file', None), \
max_len=kwargs.pop('max_len', -1), \
use_infinite_loop=kwargs.pop('use_infinite_loop', False))
# TODO: max_len should be handled here and SSIterator should zip semantic_data and
# data.
self.k_batches = kwargs.pop('sort_k_batches', 20)
# TODO: For backward compatibility. This should be removed in future versions
# i.e. remove all the x_reversed computations in the model itself.
self.state = kwargs.pop('state', None)
# ----------------
self.batch_iter = None

def get_homogenous_batch_iter(self, batch_size = -1):
while True:
batch_size = self.batch_size if (batch_size == -1) else batch_size

data = []
for k in range(self.k_batches):
batch = SSIterator.next(self)
if batch:
data.append(batch)

if not len(data):
return

number_of_batches = len(data)
data = list(itertools.chain.from_iterable(data))

# Split list of words from the triple index
data_x = []
data_semantic = []
for i in range(len(data)):
data_x.append(data[i][0])
data_semantic.append(data[i][1])

x = numpy.asarray(list(itertools.chain(data_x)))
x_semantic = numpy.asarray(list(itertools.chain(data_semantic)))

lens = numpy.asarray([map(len, x)])
order = numpy.argsort(lens.max(axis=0))

for k in range(number_of_batches):
indices = order[k * batch_size:(k + 1) * batch_size]
batch = create_padded_batch(state, [x[indices]])

# Add semantic information to batch; take care to fill with -1 (=n/a) whenever the batch is filled with empty triples
if 'semantic_information_dim' in state:
batch['x_semantic'] = - numpy.ones((state['bs'], state['semantic_information_dim'])).astype('int32')
batch['x_semantic'][0:len(indices), :] = numpy.asarray(list(itertools.chain(x_semantic[indices]))).astype('int32')
else:
batch['x_semantic'] = None

if batch:
yield batch
def start(self):
SSIterator.start(self)
self.batch_iter = None

def next(self, batch_size = -1):
"""
We can specify a batch size,
independent of the object initialization.
"""
if not self.batch_iter:
self.batch_iter = self.get_homogenous_batch_iter(batch_size)
try:
batch = next(self.batch_iter)
except StopIteration:
return None
return batch

semantic_train_file = None
semantic_valid_file = None
semantic_test_file = None

for k in range(number_of_batches):
indices = order[k * batch_size:(k + 1) * batch_size]
batch = create_padded_batch(self.state, [x[indices]])

# Add semantic information to batch; take care to fill with -1 (=n/a) whenever the batch is filled with empty triples
if 'semantic_information_dim' in self.state:
batch['x_semantic'] = - numpy.ones((self.state['bs'], self.state['semantic_information_dim'])).astype('int32')
batch['x_semantic'][0:len(indices), :] = numpy.asarray(list(itertools.chain(x_semantic[indices]))).astype('int32')
else:
batch['x_semantic'] = None

if batch:
yield batch

def start(self):
SSIterator.start(self)
self.batch_iter = None

def next(self, batch_size = -1):
"""
We can specify a batch size,
independent of the object initialization.
"""
if not self.batch_iter:
self.batch_iter = self.get_homogenous_batch_iter(batch_size)
try:
batch = next(self.batch_iter)
except StopIteration:
return None
return batch

def get_train_iterator(state):
semantic_train_path = None
semantic_valid_path = None
if 'train_semantic' in state:
assert state['valid_semantic']
assert state['test_semantic']
semantic_train_file = state['train_semantic']
semantic_valid_file = state['valid_semantic']
semantic_test_file = state['test_semantic']

semantic_train_path = state['train_semantic']
semantic_valid_path = state['valid_semantic']

train_data = Iterator(
batch_size=int(state['bs']),
triple_file=state['train_triples'],
semantic_file = semantic_train_file,
queue_size=100,
state['train_triples'],
int(state['bs']),
state=state,
seed=state['seed'],
semantic_file=semantic_train_path,
use_infinite_loop=True,
max_len=state['seqlen'])

valid_data = Iterator(
batch_size=int(state['bs']),
triple_file=state['valid_triples'],
semantic_file = semantic_valid_file,
state['valid_triples'],
int(state['bs']),
state=state,
seed=state['seed'],
semantic_file=semantic_valid_path,
use_infinite_loop=False,
queue_size=100,
max_len=state['seqlen'])

return train_data, valid_data

def get_test_iterator(state):
assert 'test_triples' in state
test_path = state.get('test_triples')
semantic_test_path = state.get('test_semantic', None)

test_data = Iterator(
batch_size=int(state['bs']),
triple_file=state['test_triples'],
semantic_file = semantic_test_file,
test_path,
int(state['bs']),
state=state,
seed=state['seed'],
semantic_file=semantic_test_path,
use_infinite_loop=False,
queue_size=100,
max_len=state['seqlen'])

return train_data, valid_data, test_data
return test_data
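With this refactor the old get_batch_iterator(rng, state) entry point is split into get_train_iterator(state) and get_test_iterator(state), both configured purely through the state dictionary. A hedged usage sketch follows; the paths and numbers are placeholders, and the real state object in this repository carries more keys than the ones referenced in this diff (train_triples, valid_triples, test_triples, bs, seed, seqlen):

from data_iterator import get_train_iterator, get_test_iterator

# Illustrative only: paths and values below are placeholders, not repository defaults.
state = {
    'train_triples': 'Data/train.triples.pkl',
    'valid_triples': 'Data/valid.triples.pkl',
    'test_triples':  'Data/test.triples.pkl',
    'bs': 80,        # batch size
    'seed': 1234,    # forwarded to the iterator's RNG
    'seqlen': 80,    # max_len passed to the iterators
}

train_data, valid_data = get_train_iterator(state)
test_data = get_test_iterator(state)

train_data.start()          # spawns the background SSFetcher thread
batch = train_data.next()   # padded batch dict: 'x', 'x_mask', 'num_preds', ...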