
Commit 7e67d8b
bug fixes + architecture improvements
Rayhane-mamah authored Mar 4, 2018
1 parent 82997ee commit 7e67d8b
Showing 12 changed files with 556 additions and 282 deletions.
64 changes: 42 additions & 22 deletions tacotron/hparams.py
@@ -19,29 +19,35 @@
cmu_dict=False,

#Model
outputs_per_step = 1,
attention_dim = 128,
parameter_init = 0.5,
sharpening_factor = 1.0,
max_decode_length = None,
num_classes = None,
time_major = False,
hidden_dim = 128,
embedding_dim = 512,
num_decoder_layers=2,
outputs_per_step = 1, #number of frames to generate at each decoding step
embedding_dim = 512, #dimension of embedding space
enc_conv_num_layers=3, #number of encoder convolutional layers
enc_conv_kernel_size=(5, ), #size of encoder convolution filters for each layer
enc_conv_channels=512, #number of encoder convolution filters for each layer
encoder_lstm_units=256, #number of lstm units for each direction (forward and backward)
attention_dim = 128, #dimension of attention space
attention_stddev_init = 0.1, #Initial standard deviation for attention projection (normal initializer)
prenet_layers=[128, 128], #number of prenet layers and number of units per layer
decoder_layers=2, #number of decoder lstm layers
decoder_lstm_units=512, #number of decoder lstm units on each layer
postnet_num_layers=5, #number of postnet convolutional layers
postnet_kernel_size=(5, ), #size of postnet convolution filters for each layer
postnet_channels=512, #number of postnet convolution filters for each layer
max_iters=808, #Max decoder steps during inference (feel free to change it)

#Training
batch_size = 32,
reg_weight = 10e-6,
decay_learning_rate = True,
decay_steps = 50000,
decay_rate = 0.97,
initial_learning_rate = 10e-3,
final_learning_rate = 10e-5,
adam_beta1 = 0.9,
adam_beta2 = 0.999,
adam_epsilon = 10e-6,
batch_size = 16, #number of training samples per training step
reg_weight = 10e-6, #regularization weight (for l2 regularization)
decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay
decay_steps = 50000, #starting point for learning rate decay (and determines the decay slope)
decay_rate = 0.97, #learning rate decay rate
initial_learning_rate = 10e-3, #starting learning rate
final_learning_rate = 10e-5, #minimum learning rate
adam_beta1 = 0.9, #AdamOptimizer beta1 parameter
adam_beta2 = 0.999, #AdamOptimizer beta2 parameter
adam_epsilon = 10e-6, #AdamOptimizer epsilon parameter
zoneout_rate=0.1, #zoneout rate for all LSTM cells in the network
dropout_rate=0.5, #dropout rate for all convolutional layers + prenet

#Eval sentences
sentences = [
@@ -52,8 +58,22 @@
'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.',
# From Google's Tacotron example page:
'Generative adversarial network or variational auto-encoder.',
'The buses aren\'t the problem, they actually provide a solution.',
'Does the quick brown fox jump over the lazy dog?',
'Basilar membrane and otolaryngology are not auto-correlations.',
'He has read the whole thing.',
'He reads books.',
"Don't desert me here in the desert!",
'He thought it was time to present the present.',
'Thisss isrealy awhsome.',
'Punctuation sensitivity, is working.',
'Punctuation sensitivity is working.',
"The buses aren't the problem, they actually provide a solution.",
"The buses aren't the PROBLEM, they actually provide a SOLUTION.",
"The quick brown fox jumps over the lazy dog.",
"Does the quick brown fox jump over the lazy dog?",
"Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?",
"She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.",
"The blue lagoon is a nineteen eighty American romance adventure film.",
"Tajima Airport serves Toyooka.",
'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.',
]
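For reference, a minimal sketch of how the decay-related hparams above could drive a TF 1.x learning-rate schedule. The helper name _learning_rate_decay and the use of tf.train.exponential_decay are illustrative assumptions, not code from this commit:

import tensorflow as tf
from hparams import hparams

def _learning_rate_decay(global_step):
    #Assumed schedule: exponential decay controlled by decay_steps/decay_rate,
    #clipped so the rate never drops below final_learning_rate.
    lr = tf.train.exponential_decay(
        hparams.initial_learning_rate,
        global_step,
        hparams.decay_steps,
        hparams.decay_rate,
        staircase=False)
    return tf.maximum(lr, hparams.final_learning_rate)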

131 changes: 131 additions & 0 deletions tacotron/models/attention.py
@@ -0,0 +1,131 @@
"""Attention file for location based attention (compatible with tensorflow attention wrapper)"""

import tensorflow as tf
from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import _BaseAttentionMechanism
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.layers import core as layers_core
from tensorflow.python.ops import variable_scope
from hparams import hparams


def _location_based_score(W_query, attention_weights, W_keys):
"""Impelements Bahdanau-style (cumulative) scoring function.
This attention is described in:
J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben-
gio, “Attention-based models for speech recognition,” in Ad-
vances in Neural Information Processing Systems, 2015, pp.
577–585.
#######################################################################
hybrid attention (content-based + location-based)
f = F * α_{i-1}
energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f)))
#######################################################################
Args:
W_query: Tensor, shape '[batch_size, num_units]' to compare to location features.
attention_weights (alignments): previous attention weights, shape '[batch_size, max_time]'
Returns:
A '[batch_size, max_time]'
"""
dtype = W_query.dtype
# Get the number of hidden units from the trailing dimension of query
num_units = W_query.shape[-1].value or array_ops.shape(W_query)[-1]

# [batch_size, max_time] -> [batch_size, max_time, 1]
attention_weights = tf.expand_dims(attention_weights, axis=2)
# location features [batch_size, max_time, filters]
f = tf.layers.conv1d(attention_weights, filters=32,
kernel_size=31, padding='same',
name='location_features')

# Projected location features [batch_size, max_time, attention_dim]
W_fil = tf.contrib.layers.fully_connected(
f,
num_outputs=num_units,
activation_fn=None,
weights_initializer=tf.truncated_normal_initializer(
stddev=hparams.attention_stddev_init),
biases_initializer=tf.zeros_initializer(),
scope='W_filter')

v_a = tf.get_variable(
'v_a', shape=[num_units], dtype=tf.float32)

return tf.reduce_sum(v_a * tf.tanh(W_keys + tf.expand_dims(W_query, axis=1) + W_fil), axis=2)


class LocationBasedAttention(_BaseAttentionMechanism):
"""Impelements Bahdanau-style (cumulative) scoring function.
Usually referred to as "hybrid" attention (content-based + location-based)
This attention is described in:
J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben-
gio, “Attention-based models for speech recognition,” in Ad-
vances in Neural Information Processing Systems, 2015, pp.
577–585.
"""

def __init__(self,
num_units,
memory,
memory_sequence_length=None,
probability_fn=None,
score_mask_value=tf.float32.min,
name='LocationBasedAttention'):
"""Construct the Attention mechanism.
Args:
num_units: The depth of the query mechanism.
memory: The memory to query; usually the output of an RNN encoder. This
tensor should be shaped `[batch_size, max_time, ...]`.
memory_sequence_length (optional): Sequence lengths for the batch entries
in memory. If provided, the memory tensor rows are masked with zeros
for values past the respective sequence lengths.
probability_fn: (optional) A `callable`. Converts the score to
probabilities. The default is @{tf.nn.softmax}. Other options include
@{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
Its signature should be: `probabilities = probability_fn(score)`.
score_mask_value: (optional): The mask value for score before passing into
`probability_fn`. The default is -inf. Only used if
`memory_sequence_length` is not None.
name: Name to use when creating ops.
"""
if probability_fn is None:
probability_fn = nn_ops.softmax
wrapped_probability_fn = lambda score, _: probability_fn(score)
super(LocationBasedAttention, self).__init__(
query_layer=layers_core.Dense(
num_units, name='query_layer', use_bias=False),
memory_layer=layers_core.Dense(
num_units, name='memory_layer', use_bias=False),
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
score_mask_value=score_mask_value,
name=name)
self._num_units = num_units
self._name = name

def __call__(self, query, state):
"""Score the query based on the keys and values.
Args:
query: Tensor of dtype matching `self.values` and shape
`[batch_size, query_depth]`.
state: previous alignments, Tensor of dtype matching `self.values` and shape
`[batch_size, alignments_size]`
(`alignments_size` is memory's `max_time`).
Returns:
alignments: Tensor of dtype matching `self.values` and shape
`[batch_size, alignments_size]` (`alignments_size` is memory's
`max_time`).
"""
previous_alignments = state
with variable_scope.variable_scope(None, "location_based_attention", [query]):
# processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim]
processed_query = self.query_layer(query) if self.query_layer else query
# energy shape [batch_size, max_time]
energy = _location_based_score(processed_query, previous_alignments, self._keys)
# alignments shape = energy shape = [batch_size, max_time]
alignments = self._probability_fn(energy, previous_alignments)
#The tensorflow attention wrapper requires a next_state to be returned; here it is simply the new alignments
next_state = alignments
return alignments, next_state
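For context, a minimal sketch of how LocationBasedAttention might be wired into tf.contrib.seq2seq.AttentionWrapper around a stacked decoder LSTM. The function name build_attention_decoder_cell, the import path, and the exact cell wiring are illustrative assumptions, not part of this commit:

import tensorflow as tf
from hparams import hparams
from tacotron.models.attention import LocationBasedAttention

def build_attention_decoder_cell(encoder_outputs, input_lengths):
    #encoder_outputs: [batch_size, max_time, 2 * encoder_lstm_units]
    attention_mechanism = LocationBasedAttention(
        num_units=hparams.attention_dim,
        memory=encoder_outputs,
        memory_sequence_length=input_lengths)

    #Stack of decoder LSTM cells (decoder_layers x decoder_lstm_units)
    decoder_cell = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.LSTMCell(hparams.decoder_lstm_units)
         for _ in range(hparams.decoder_layers)])

    #AttentionWrapper calls LocationBasedAttention.__call__(query, state)
    #at every decoding step to recompute the alignments.
    return tf.contrib.seq2seq.AttentionWrapper(
        decoder_cell,
        attention_mechanism,
        alignment_history=True)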
