Merge pull request #4 from fchollet/master
update master
olegsinavski committed Dec 16, 2015
2 parents 5e06aa5 + 42b3d37 commit ca37f96
Showing 9 changed files with 408 additions and 52 deletions.
4 changes: 4 additions & 0 deletions docs/autogen.py
@@ -80,6 +80,10 @@ def get_method_signature(method):
for a in args:
st += str(a) + ', '
for a, v in kwargs:
if type(v) == str:
v = '\'' + v + '\''
elif type(v) == unicode:
v = 'u\'' + v + '\''
st += str(a) + '=' + str(v) + ', '
if kwargs or args:
return st[:-2] + ')'
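For illustration, the change above quotes string (and, under Python 2, unicode) default values so that generated signatures read as valid Python. A minimal standalone sketch of the same formatting logic, using a hypothetical `format_signature` helper rather than the real autogen code:

```python
# Hypothetical helper mirroring the snippet above; not part of docs/autogen.py.
def format_signature(name, args, kwargs):
    st = name + '('
    for a in args:
        st += str(a) + ', '
    for a, v in kwargs:           # kwargs is a list of (name, default) pairs
        if isinstance(v, str):
            v = '\'' + v + '\''   # quote string defaults: padding='pre', not padding=pre
        st += str(a) + '=' + str(v) + ', '
    if args or kwargs:
        return st[:-2] + ')'
    return st + ')'

print(format_signature('pad_sequences', ['sequences'], [('padding', 'pre'), ('value', 0.)]))
# pad_sequences(sequences, padding='pre', value=0.0)
```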
2 changes: 1 addition & 1 deletion keras/backend/theano_backend.py
@@ -412,7 +412,7 @@ def _step(input, *states):
if masking:
# if all-zero input timestep, return
# all-zero output and unchanged states
switch = T.any(input)
switch = T.any(input, axis=-1, keepdims=True)
output = T.switch(switch, output, 0. * output)
return_states = []
for state, new_state in zip(states, new_states):
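For context: `T.any(input)` reduces over every axis and returns one scalar, so a single non-zero value anywhere in the batch would unmask every sample at that timestep. With `axis=-1, keepdims=True` the reduction produces one switch per sample that broadcasts against the output. A NumPy sketch of the difference (NumPy stands in for Theano purely for illustration):

```python
import numpy as np

# One timestep, two samples: sample 0 is all-zero (padding), sample 1 carries data.
x = np.array([[0., 0., 0.],
              [1., 2., 0.]])

np.any(x)                              # True -> one scalar for the whole batch
np.any(x, axis=-1, keepdims=True)      # [[False], [True]] -> one switch per sample

output = np.array([[0.5, 0.5],
                   [0.7, 0.3]])
switch = np.any(x, axis=-1, keepdims=True)
masked = np.where(switch, output, 0. * output)  # zeroes sample 0, keeps sample 1
```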
44 changes: 34 additions & 10 deletions keras/layers/containers.py
@@ -23,16 +23,41 @@ def __init__(self, layers=[]):
self.layer_cache = {}
for layer in layers:
self.add(layer)

def __call__(self, X, train=False):
self._cache_enabled = True

def __call__(self, X, mask=None, train=False):
# turn off layer cache temporarily
tmp_cache_enabled = self.cache_enabled
self.cache_enabled = False
# recursively search for a layer which is not a Sequential model
layer = self
while issubclass(layer.__class__, Sequential):
layer = layer.layers[0]
# set temporary input to first layer
tmp = self.layers[0].get_input
self.layers[0].get_input = lambda _: X
tmp_input = layer.get_input
tmp_mask = None
layer.get_input = lambda _: X
if hasattr(layer, 'get_input_mask'):
tmp_mask = layer.get_input_mask
layer.get_input_mask = lambda _: mask
Y = self.get_output(train=train)
# return input to first layer to what it was
self.layers[0].get_input = tmp
# return input from first layer to what it was
layer.get_input = tmp_input
if hasattr(layer, 'get_input_mask'):
layer.get_input_mask = tmp_mask
self.cache_enabled = tmp_cache_enabled
return Y

@property
def cache_enabled(self):
return self._cache_enabled

@cache_enabled.setter
def cache_enabled(self, value):
self._cache_enabled = value
for l in self.layers:
l.cache_enabled = value

def set_previous(self, layer):
self.layers[0].previous = layer
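The new `__call__` above descends to the innermost non-`Sequential` layer, temporarily points its `get_input` (and `get_input_mask`, when present) at the supplied tensors, computes the output with caching switched off, and then restores everything. A condensed standalone sketch of that swap-and-restore pattern (the `FakeLayer` class and `call_on` helper are illustrative, not Keras code):

```python
class FakeLayer(object):
    """Stand-in for a layer whose output is derived from whatever get_input returns."""
    def get_input(self, train=False):
        raise Exception('no input connected')

    def get_output(self, train=False):
        return ('output-of', self.get_input(train))


def call_on(layer, X):
    tmp_input = layer.get_input          # remember the original accessor
    layer.get_input = lambda _=False: X  # temporarily feed X as the input
    try:
        return layer.get_output(train=False)
    finally:
        layer.get_input = tmp_input      # restore, exactly as Sequential.__call__ does


layer = FakeLayer()
print(call_on(layer, 'X'))  # ('output-of', 'X'); afterwards the layer is untouched
```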

@@ -375,9 +400,7 @@ def add_shared_node(self, layer, name, inputs=[], merge_mode=None,
dot_axes: Same meaning as `dot_axes` argument of `add_node()`
outputs: Used when `merge_mode=None`. Names for the output nodes.
create_output: Same meaning as `create_output` argument of `add_node()`.
When creating an output, `merge_mode` must be specified.
'''
layer.layer_cache = self.layer_cache
if name in self.namespace:
raise Exception('Duplicate node identifier: ' + name)
for o in outputs:
@@ -408,7 +431,8 @@ def add_shared_node(self, layer, name, inputs=[], merge_mode=None,
raise Exception('Unknown identifier: ' + input)
s = Siamese(layer, layers, merge_mode,
concat_axis=concat_axis,
dot_axes=dot_axes)
dot_axes=dot_axes,
is_graph=True)
self.namespace.add(name)
self.nodes[name] = s
self.node_config.append({'name': name,
@@ -425,7 +449,7 @@ def add_shared_node(self, layer, name, inputs=[], merge_mode=None,
self.namespace.add(sh_name)
self.nodes[sh_name] = sh
self.node_config.append({'name': sh_name,
'inputs': [s],
'inputs': [name],
'create_output': create_output})
if create_output:
self.add_output(sh_name, input=sh_name)
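Two details change in `add_shared_node` here: the `Siamese` wrapper is constructed with `is_graph=True`, and each generated `SiameseHead` entry in `node_config` now records the shared node's name instead of the `Siamese` object itself. A plausible reason for the latter (an assumption, not stated in the diff) is that `node_config` feeds model serialization, where only string identifiers make sense; a tiny standalone illustration:

```python
import json

class Siamese(object):  # bare stand-in object, only used to show the contrast
    pass

s, name = Siamese(), 'shared_dense'

try:
    json.dumps({'name': 'shared_dense_0', 'inputs': [s]})        # object in the config
except TypeError as e:
    print('not serializable:', e)

print(json.dumps({'name': 'shared_dense_0', 'inputs': [name]}))  # name in the config
```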
71 changes: 51 additions & 20 deletions keras/layers/core.py
@@ -35,7 +35,8 @@ class Layer(object):
def __init__(self, **kwargs):
allowed_kwargs = {'input_shape',
'trainable',
'batch_input_shape'}
'batch_input_shape',
'cache_enabled'}
for kwarg in kwargs:
assert kwarg in allowed_kwargs, "Keyword argument not understood: " + kwarg
if 'input_shape' in kwargs:
@@ -46,14 +47,31 @@ def __init__(self, **kwargs):
self._trainable = kwargs['trainable']
if not hasattr(self, 'params'):
self.params = []
self._cache_enabled = True
if 'cache_enabled' in kwargs:
self._cache_enabled = kwargs['cache_enabled']

def __call__(self, X, train=False):
@property
def cache_enabled(self):
return self._cache_enabled

@cache_enabled.setter
def cache_enabled(self, value):
self._cache_enabled = value

def __call__(self, X, mask=None, train=False):
# set temporary input
tmp = self.get_input
tmp_input = self.get_input
tmp_mask = None
if hasattr(self, 'get_input_mask'):
tmp_mask = self.get_input_mask
self.get_input_mask = lambda _: mask
self.get_input = lambda _: X
Y = self.get_output(train=train)
# return input to what it was
self.get_input = tmp
if hasattr(self, 'get_input_mask'):
self.get_input_mask = tmp_mask
self.get_input = tmp_input
return Y

def set_previous(self, layer, connection_map={}):
@@ -132,12 +150,12 @@ def get_input(self, train=False):
if hasattr(self, 'previous'):
# to avoid redundant computations,
# layer outputs are cached when possible.
if hasattr(self, 'layer_cache'):
if hasattr(self, 'layer_cache') and self.cache_enabled:
previous_layer_id = '%s_%s' % (id(self.previous), train)
if previous_layer_id in self.layer_cache:
return self.layer_cache[previous_layer_id]
previous_output = self.previous.get_output(train=train)
if hasattr(self, 'layer_cache'):
if hasattr(self, 'layer_cache') and self.cache_enabled:
previous_layer_id = '%s_%s' % (id(self.previous), train)
self.layer_cache[previous_layer_id] = previous_output
return previous_output
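The caching hunks make the lookup conditional on the new `cache_enabled` flag as well as on the presence of a `layer_cache` dict; the key is just the upstream layer's `id()` combined with the `train` flag, i.e. plain memoization. A standalone sketch of the pattern (names are illustrative):

```python
layer_cache = {}

def expensive_get_output(layer, train):
    # placeholder for previous.get_output(train), i.e. building the symbolic graph
    return ('output', id(layer), train)

def get_cached_output(previous, train, cache_enabled=True):
    key = '%s_%s' % (id(previous), train)
    if cache_enabled and key in layer_cache:
        return layer_cache[key]
    output = expensive_get_output(previous, train)
    if cache_enabled:
        layer_cache[key] = output
    return output
```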
@@ -212,6 +230,7 @@ def get_config(self):
config['input_shape'] = self._input_shape[1:]
if hasattr(self, '_trainable'):
config['trainable'] = self._trainable
config['cache_enabled'] = self.cache_enabled
return config

def get_params(self):
@@ -458,6 +477,7 @@ def __init__(self, layers, mode='sum', concat_axis=-1, dot_axes=-1):
if p not in self.params:
self.params.append(p)
self.constraints.append(c)
super(Merge, self).__init__()

@property
def output_shape(self):
@@ -1285,6 +1305,7 @@ def __init__(self, function, output_shape=None, **kwargs):
self._output_shape = marshal.dumps(output_shape.__code__)
else:
self._output_shape = marshal.dumps(output_shape.func_code)
super(Lambda, self).__init__()

@property
def output_shape(self):
@@ -1359,6 +1380,7 @@ def __init__(self, layers, function, output_shape=None):
self._output_shape = marshal.dumps(output_shape.__code__)
else:
self._output_shape = marshal.dumps(output_shape.func_code)
super(Lambda, self).__init__()

@property
def output_shape(self):
@@ -1442,9 +1464,10 @@ class Siamese(Layer):
merge_mode: Same meaning as `mode` argument of Merge layer
concat_axis: Same meaning as `concat_axis` argument of Merge layer
dot_axes: Same meaning as `dot_axes` argument of Merge layer
is_graph: Should be set to True when used inside `Graph`
'''
def __init__(self, layer, inputs, merge_mode='concat',
concat_axis=1, dot_axes=-1):
concat_axis=1, dot_axes=-1, is_graph=False):
if merge_mode not in ['sum', 'mul', 'concat', 'ave',
'join', 'cos', 'dot', None]:
raise Exception('Invalid merge mode: ' + str(merge_mode))
@@ -1454,17 +1477,19 @@ def __init__(self, layer, inputs, merge_mode='concat',
raise Exception(merge_mode + ' merge takes exactly 2 layers')

self.layer = layer
self.trainable = layer.trainable
self.is_graph = is_graph
self.inputs = inputs
self.params = []
self.layer.set_previous(inputs[0])
self.merge_mode = merge_mode
self.concat_axis = concat_axis
self.dot_axes = dot_axes
layer.set_previous(inputs[0])
self.params = []
self.regularizers = []
self.constraints = []
self.updates = []
layers = [layer]
if merge_mode:
if merge_mode and not is_graph:
layers += inputs
for l in layers:
params, regs, consts, updates = l.get_params()
@@ -1475,6 +1500,7 @@ def __init__(self, layer, inputs, merge_mode='concat',
if p not in self.params:
self.params.append(p)
self.constraints.append(c)
super(Siamese, self).__init__()

@property
def output_shape(self):
@@ -1512,15 +1538,18 @@ def output_shape(self):
def get_params(self):
return self.params, self.regularizers, self.constraints, self.updates

def set_layer_input(self, index):
l = self.layer
while not hasattr(l, 'previous'):
l = l.layers[0]
l.previous = self.inputs[index]
def set_layer_input(self, head):
layer = self.layer
from ..layers.containers import Sequential
while issubclass(layer.__class__, Sequential):
layer = layer.layers[0]
layer.previous = self.inputs[head]

def get_output_at(self, head, train=False):
self.set_layer_input(head)
return self.layer.get_output(train)
X = self.inputs[head].get_output(train)
mask = self.inputs[head].get_output_mask(train)
Y = self.layer(X, mask)
return Y

def get_output_shape(self, head, train=False):
self.set_layer_input(head)
@@ -1621,7 +1650,7 @@ def get_output_mask(self, train=None):

def get_weights(self):
weights = self.layer.get_weights()
if self.merge_mode:
if self.merge_mode and not self.is_graph:
for m in self.inputs:
weights += m.get_weights()
return weights
@@ -1630,7 +1659,7 @@ def set_weights(self, weights):
nb_param = len(self.layer.params)
self.layer.set_weights(weights[:nb_param])
weights = weights[nb_param:]
if self.merge_mode:
if self.merge_mode and not self.is_graph:
for i in range(len(self.inputs)):
nb_param = len(self.inputs[i].params)
self.inputs[i].set_weights(weights[:nb_param])
@@ -1642,7 +1671,8 @@ def get_config(self):
'inputs': [m.get_config() for m in self.inputs],
'merge_mode': self.merge_mode,
'concat_axis': self.concat_axis,
'dot_axes': self.dot_axes}
'dot_axes': self.dot_axes,
'is_graph': self.is_graph}
base_config = super(Siamese, self).get_config()
return dict(list(base_config.items()) + list(config.items()))

@@ -1661,6 +1691,7 @@ class SiameseHead(Layer):
def __init__(self, head):
self.head = head
self.params = []
super(SiameseHead, self).__init__()

def get_output(self, train=False):
return self.get_input(train)
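In `get_weights`/`set_weights`, the input layers are now skipped when `is_graph=True`, presumably because the enclosing `Graph` already tracks those nodes' parameters and counting them twice would corrupt saved weights. A small sketch of how the flat weight list is partitioned in the two cases (plain strings stand in for parameter arrays; `split_weights` is illustrative, not the Keras method):

```python
def split_weights(layer_params, input_params, weights, is_graph):
    """Shared layer's weights come first; each input's follow unless is_graph is set."""
    nb = len(layer_params)
    layer_weights, weights = weights[:nb], weights[nb:]
    input_weights = []
    if not is_graph:
        for params in input_params:
            nb = len(params)
            input_weights.append(weights[:nb])
            weights = weights[nb:]
    return layer_weights, input_weights

flat = ['W', 'b', 'W0', 'b0', 'W1', 'b1']
print(split_weights(['W', 'b'], [['W0', 'b0'], ['W1', 'b1']], flat, is_graph=False))
# (['W', 'b'], [['W0', 'b0'], ['W1', 'b1']])
print(split_weights(['W', 'b'], [['W0', 'b0'], ['W1', 'b1']], flat, is_graph=True))
# (['W', 'b'], [])
```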
53 changes: 40 additions & 13 deletions keras/preprocessing/sequence.py
@@ -6,7 +6,7 @@

def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.):
"""
Pad each sequence to the same length:
Pad each sequence to the same length:
the length of the longest sequence.
If maxlen is provided, any sequence longer
@@ -15,6 +15,19 @@ def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.):
Supports post-padding and pre-padding (default).
Parameters:
-----------
sequences: list of lists where each element is a sequence
maxlen: int, maximum length
dtype: type to cast the resulting sequence.
padding: 'pre' or 'post', pad either before or after each sequence.
truncating: 'pre' or 'post', remove values from sequences larger than
maxlen either in the beginning or in the end of the sequence
value: float, value to pad the sequences to the desired value.
Returns:
x: numpy array with dimensions (number_of_sequences, maxlen)
"""
lengths = [len(s) for s in sequences]

@@ -47,39 +60,53 @@ def make_sampling_table(size, sampling_factor=1e-5):
This generates an array where the ith element
is the probability that a word of rank i would be sampled,
according to the sampling distribution used in word2vec.
The word2vec formula is:
p(word) = min(1, sqrt(word.frequency/sampling_factor) / (word.frequency/sampling_factor))
We assume that the word frequencies follow Zipf's law (s=1) to derive
We assume that the word frequencies follow Zipf's law (s=1) to derive
a numerical approximation of frequency(rank):
frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))
where gamma is the Euler-Mascheroni constant.
Parameters:
-----------
size: int, number of possible words to sample.
'''
gamma = 0.577
rank = np.array(list(range(size)))
rank[0] = 1
inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1./(12.*rank)
f = sampling_factor * inv_fq

return np.minimum(1., f / np.sqrt(f))


def skipgrams(sequence, vocabulary_size,
window_size=4, negative_samples=1., shuffle=True,
categorical=False, sampling_table=None):
'''
Take a sequence (list of indexes of words),
def skipgrams(sequence, vocabulary_size,
window_size=4, negative_samples=1., shuffle=True,
categorical=False, sampling_table=None):
'''
Take a sequence (list of indexes of words),
returns couples of [word_index, other_word index] and labels (1s or 0s),
where label = 1 if 'other_word' belongs to the context of 'word',
and label=0 if 'other_word' is randomly sampled
@param vocabulary_size: int. maximum possible word index + 1
@param window_size: int. actually half-window. The window of a word wi will be [i-window_size, i+window_size+1]
@param negative_samples: float >= 0. 0 for no negative (=random) samples. 1 for same number as positive samples. etc.
@param categorical: bool. if False, labels will be integers (eg. [0, 1, 1 .. ]),
Parameters:
-----------
vocabulary_size: int. maximum possible word index + 1
window_size: int. actually half-window. The window of a word wi will be [i-window_size, i+window_size+1]
negative_samples: float >= 0. 0 for no negative (=random) samples. 1 for same number as positive samples. etc.
categorical: bool. if False, labels will be integers (eg. [0, 1, 1 .. ]),
if True labels will be categorical eg. [[1,0],[0,1],[0,1] .. ]
Note: by convention, index 0 in the vocabulary is a non-word and will be skipped.
Returns:
--------
couples, labels: where `couples` are int pairs and
`labels` are either 0 or 1.
Notes:
------
By convention, index 0 in the vocabulary is a non-word and will be skipped.
'''
couples = []
labels = []
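The expanded docstrings above document the preprocessing helpers; a short usage sketch of `pad_sequences` plus the Zipf-based sampling-table formula reproduced in NumPy (the toy sequences and `size` are illustrative):

```python
import numpy as np
from keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2, 3], [4, 5], [6]]
pad_sequences(seqs, maxlen=4)
# array([[0, 1, 2, 3],
#        [0, 0, 4, 5],
#        [0, 0, 0, 6]], dtype=int32)
pad_sequences(seqs, maxlen=2, padding='post', truncating='pre')
# array([[2, 3],
#        [4, 5],
#        [6, 0]], dtype=int32)

# make_sampling_table's formula, written out directly:
size, sampling_factor, gamma = 10, 1e-5, 0.577
rank = np.array(list(range(size)))
rank[0] = 1
inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1. / (12. * rank)
sampling_table = np.minimum(1., sampling_factor * inv_fq / np.sqrt(sampling_factor * inv_fq))
```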
