diff --git a/madmom/ml/nn/layers.py b/madmom/ml/nn/layers.py
index bb5d499f7..e2efb9cfd 100644
--- a/madmom/ml/nn/layers.py
+++ b/madmom/ml/nn/layers.py
@@ -344,6 +344,156 @@ def activate(self, data):
         return out
 
 
+class GRUCell(object):
+    """
+    Cell as used by GRU layers proposed in [1]_. The cell output is computed
+    by
+
+    .. math::
+        h = tanh(W_{xh} * x_t + r_t * (W_{hh} * h_{t-1}) + b),
+
+    where :math:`r_t` is the activation of the reset gate.
+
+    Parameters
+    ----------
+    weights : numpy array, shape (num_inputs, num_hiddens)
+        Weights of the connections between inputs and cell.
+    recurrent_weights : numpy array, shape (num_hiddens, num_hiddens)
+        Weights of the connections between cell and cell output of the
+        previous time step.
+    bias : scalar or numpy array, shape (num_hiddens,)
+        Bias.
+    activation_fn : numpy ufunc, optional
+        Activation function.
+
+    References
+    ----------
+    .. [1] Kyunghyun Cho, Bart Van Merrienboer, Dzmitry Bahdanau, and Yoshua
+           Bengio,
+           "On the properties of neural machine translation: Encoder-decoder
+           approaches",
+           http://arxiv.org/abs/1409.1259, 2014.
+
+    Notes
+    -----
+    There are two formulations of the GRUCell in the literature. Here,
+    we adopted the (slightly older) one proposed in [1]_, which is also
+    implemented in the Lasagne toolbox.
+
+    """
+
+    def __init__(self, weights, recurrent_weights, bias, activation_fn=tanh):
+        self.weights = weights
+        self.recurrent_weights = recurrent_weights
+        self.bias = bias
+        self.activation_fn = activation_fn
+
+    def activate(self, data, reset_gate, prev):
+        """
+        Activate the cell with the given input, reset gate activation and the
+        previous output.
+
+        Parameters
+        ----------
+        data : scalar or numpy array, shape (num_inputs,)
+            Input data for the cell.
+        reset_gate : scalar or numpy array, shape (num_hiddens,)
+            Activation of the reset gate.
+        prev : scalar or numpy array, shape (num_hiddens,)
+            Cell output of the previous time step.
+
+        Returns
+        -------
+        numpy array, shape (num_hiddens,)
+            Activations of the cell for this data.
+
+        """
+        # weight input and add bias
+        out = np.dot(data, self.weights) + self.bias
+        # weight previous cell output and apply reset gate
+        out += reset_gate * np.dot(prev, self.recurrent_weights)
+        # apply activation function and return it
+        return self.activation_fn(out)
+
+
+class GRULayer(Layer):
+    """
+    Recurrent network layer with Gated Recurrent Units (GRU) as proposed in
+    [1]_.
+
+    Parameters
+    ----------
+    reset_gate : :class:`Gate`
+        Reset gate.
+    update_gate : :class:`Gate`
+        Update gate.
+    cell : :class:`GRUCell`
+        GRU cell.
+    hid_init : numpy array, shape (num_hiddens,), optional
+        Initial state of hidden units.
+
+    References
+    ----------
+    .. [1] Kyunghyun Cho, Bart Van Merrienboer, Dzmitry Bahdanau, and Yoshua
+           Bengio,
+           "On the properties of neural machine translation: Encoder-decoder
+           approaches",
+           http://arxiv.org/abs/1409.1259, 2014.
+
+    Notes
+    -----
+    There are two formulations of the GRUCell in the literature. Here,
+    we adopted the (slightly older) one proposed in [1]_, which is also
+    implemented in the Lasagne toolbox.
+
+    """
+
+    def __init__(self, reset_gate, update_gate, cell, hid_init=None):
+        # init the gates and cell
+        self.reset_gate = reset_gate
+        self.update_gate = update_gate
+        self.cell = cell
+        if hid_init is None:
+            hid_init = np.zeros(cell.bias.size, dtype=NN_DTYPE)
+        self.hid_init = hid_init
+
+    def activate(self, data):
+        """
+        Activate the GRU layer.
+
+        Parameters
+        ----------
+        data : numpy array, shape (num_frames, num_inputs)
+            Activate with this data.
+
+        Returns
+        -------
+        numpy array, shape (num_frames, num_hiddens)
+            Activations for this data.
+
+        """
+        # init arrays
+        size = len(data)
+        # output matrix for the whole sequence
+        out = np.zeros((size, self.update_gate.bias.size), dtype=NN_DTYPE)
+        # output (of the previous time step)
+        out_ = self.hid_init
+        # process the input data
+        for i in range(size):
+            # cache input data
+            data_ = data[i]
+            # reset gate:
+            # operate on current data and previous output (activation)
+            rg = self.reset_gate.activate(data_, out_)
+            # update gate:
+            # operate on current data and previous output (activation)
+            ug = self.update_gate.activate(data_, out_)
+            # hidden_update:
+            # implemented as proposed in [1]
+            hug = self.cell.activate(data_, rg, out_)
+            # output (activation)
+            out_ = ug * hug + (1 - ug) * out_
+            out[i] = out_
+        return out
+
+
 class ConvolutionalLayer(FeedForwardLayer):
     """
     Convolutional network layer.
@@ -506,7 +656,7 @@ class BatchNormLayer(Layer):
     """
     Batch normalization layer with activation function. The previous layer
     is usually linear with no bias - the BatchNormLayer's beta parameter
-    replaces it. See [1] for a detailed understanding of the parameters.
+    replaces it. See [1]_ for a detailed understanding of the parameters.
 
     Parameters
     ----------
diff --git a/tests/test_ml_nn.py b/tests/test_ml_nn.py
index f0c7c117e..0084db1f8 100644
--- a/tests/test_ml_nn.py
+++ b/tests/test_ml_nn.py
@@ -73,6 +73,75 @@ def test_cnn(self):
                                     0.84207922, 0.21631248]))
 
 
+class TestGRUClass(unittest.TestCase):
+
+    W_xr = np.array([[-0.42948743, -1.29989187],
+                     [0.77213901, 0.86070993],
+                     [1.13791823, -0.87066225]])
+    W_xu = np.array([[0.44875312, 0.07172084],
+                     [-0.24292999, 1.318794],
+                     [1.0270179, 0.16293946]])
+    W_xhu = np.array([[0.8812559, 1.35859991],
+                      [1.04311944, -0.25449358],
+                      [-1.09539597, 1.19808424]])
+    W_hr = np.array([[0.96696973, 0.1384294],
+                     [-0.09561655, -1.23413809]])
+    W_hu = np.array([[0.04664641, 0.59561686],
+                     [1.00325841, -0.11574791]])
+    W_hhu = np.array([[1.19742848, 1.07850016],
+                      [0.35234964, -1.45348681]])
+    b_r = np.array([1.41851288, -0.39743243])
+    b_u = np.array([-0.78729095, 0.83385797])
+    b_hu = np.array([1.25143065, -0.97715625])
+
+    IN = np.array([[0.91298812, -1.47626202, -1.08667502],
+                   [0.49814883, -0.0104938, 0.93869008],
+                   [-1.12282135, 0.3780883, 1.42017503],
+                   [0.62669439, 0.89438929, -0.69354132],
+                   [0.16162221, -1.00166208, 0.23579985]])
+    H = np.array([0.02345737, 0.34454183])
+
+    def setUp(self):
+        self.reset_gate = layers.Gate(
+            TestGRUClass.W_xr, TestGRUClass.b_r, TestGRUClass.W_hr,
+            activation_fn=activations.sigmoid)
+        self.update_gate = layers.Gate(
+            TestGRUClass.W_xu, TestGRUClass.b_u, TestGRUClass.W_hu,
+            activation_fn=activations.sigmoid)
+        self.gru_cell = layers.GRUCell(
+            TestGRUClass.W_xhu, TestGRUClass.W_hhu, TestGRUClass.b_hu)
+        self.gru_1 = layers.GRULayer(self.reset_gate, self.update_gate,
+                                     self.gru_cell)
+        self.gru_2 = layers.GRULayer(self.reset_gate, self.update_gate,
+                                     self.gru_cell, hid_init=TestGRUClass.H)
+
+    def test_process(self):
+        self.assertTrue(
+            np.allclose(self.reset_gate.activate(TestGRUClass.IN[0, :],
+                                                 TestGRUClass.H),
+                        np.array([0.20419282, 0.08861294])))
+        self.assertTrue(
+            np.allclose(self.update_gate.activate(TestGRUClass.IN[0, :],
+                                                  TestGRUClass.H),
+                        np.array([0.31254834, 0.2226105])))
+        self.assertTrue(
+            np.allclose(self.gru_cell.activate(TestGRUClass.IN[0, :],
+                                               TestGRUClass.H,
+                                               TestGRUClass.H),
+                        np.array([0.9366396, -0.67876764])))
+        self.assertTrue(
+            np.allclose(self.gru_1.activate(TestGRUClass.IN),
+                        np.array([[0.22772433, -0.13181415],
+                                  [0.49479958, 0.51224858],
+                                  [0.08539771, -0.56119639],
+                                  [0.1946809, -0.50421363],
+                                  [0.17403202, -0.27258521]])))
+        self.assertTrue(
+            np.allclose(self.gru_2.activate(TestGRUClass.IN),
+                        np.array([[0.30988133, 0.13258138],
+                                  [0.60639685, 0.55714613],
+                                  [0.21366976, -0.55568963],
+                                  [0.30860096, -0.43686554],
+                                  [0.28866628, -0.23025239]])))
+
+
 class TestBatchNormLayerClass(unittest.TestCase):
 
     IN = np.array([[[0.32400414, 0.31483042],
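
For reference, a minimal usage sketch of how the classes added by this patch fit together: two Gate instances and a GRUCell are wired into a GRULayer, which is then activated on a short sequence. This snippet is not part of the diff; the weights are random placeholders and only their shapes matter (num_inputs=3, num_hiddens=2, mirroring the tests above).

# minimal sketch, assuming this patch is applied to madmom
import numpy as np

from madmom.ml.nn import activations, layers

num_inputs, num_hiddens = 3, 2
rng = np.random.RandomState(0)

# reset and update gates operate on the current input and the previous output
reset_gate = layers.Gate(rng.randn(num_inputs, num_hiddens),
                         rng.randn(num_hiddens),
                         rng.randn(num_hiddens, num_hiddens),
                         activation_fn=activations.sigmoid)
update_gate = layers.Gate(rng.randn(num_inputs, num_hiddens),
                          rng.randn(num_hiddens),
                          rng.randn(num_hiddens, num_hiddens),
                          activation_fn=activations.sigmoid)
# the cell computes the candidate activation, scaled by the reset gate
cell = layers.GRUCell(rng.randn(num_inputs, num_hiddens),
                      rng.randn(num_hiddens, num_hiddens),
                      rng.randn(num_hiddens))
gru = layers.GRULayer(reset_gate, update_gate, cell)

# input: (num_frames, num_inputs) -> output: (num_frames, num_hiddens)
data = rng.randn(5, num_inputs)
print(gru.activate(data))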