
Commit 7e67d8b
bug fixes + architecture improvements
Rayhane-mamah authored Mar 4, 2018
1 parent 82997ee commit 7e67d8b
Showing 12 changed files with 556 additions and 282 deletions.
64 changes: 42 additions & 22 deletions tacotron/hparams.py
@@ -19,29 +19,35 @@
cmu_dict=False,

#Model
outputs_per_step = 1,
attention_dim = 128,
parameter_init = 0.5,
sharpening_factor = 1.0,
max_decode_length = None,
num_classes = None,
time_major = False,
hidden_dim = 128,
embedding_dim = 512,
num_decoder_layers=2,
outputs_per_step = 1, #number of frames to generate at each decoding step
embedding_dim = 512, #dimension of embedding space
enc_conv_num_layers=3, #number of encoder convolutional layers
enc_conv_kernel_size=(5, ), #size of encoder convolution filters for each layer
enc_conv_channels=512, #number of encoder convolution filters for each layer
encoder_lstm_units=256, #number of lstm units for each direction (forward and backward)
attention_dim = 128, #dimension of attention space
attention_stddev_init = 0.1, #Initial standard deviation for attention projection (normal initializer)
prenet_layers=[128, 128], #number of prenet layers and number of units per layer
decoder_layers=2, #number of decoder lstm layers
decoder_lstm_units=512, #number of decoder lstm units on each layer
postnet_num_layers=5, #number of postnet convolutional layers
postnet_kernel_size=(5, ), #size of postnet convolution filters for each layer
postnet_channels=512, #number of postnet convolution filters for each layer
max_iters=808, #Max decoder steps during inference (feel free to change it)

#Training
batch_size = 32,
reg_weight = 10e-6,
decay_learning_rate = True,
decay_steps = 50000,
decay_rate = 0.97,
initial_learning_rate = 10e-3,
final_learning_rate = 10e-5,
adam_beta1 = 0.9,
adam_beta2 = 0.999,
adam_epsilon = 10e-6,
batch_size = 16, #number of training samples per training step
reg_weight = 10e-6, #regularization weight (for l2 regularization)
decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay
decay_steps = 50000, #starting point for learning rate decay (and determines the decay slope)
decay_rate = 0.97, #learning rate decay rate
initial_learning_rate = 10e-3, #starting learning rate
final_learning_rate = 10e-5, #minimum learning rate
adam_beta1 = 0.9, #AdamOptimizer beta1 parameter
adam_beta2 = 0.999, #AdamOptimizer beta2 parameter
adam_epsilon = 10e-6, #AdamOptimizer epsilon parameter
zoneout_rate=0.1, #zoneout rate for all LSTM cells in the network
dropout_rate=0.5, #dropout rate for all convolutional layers + prenet

#Eval sentences
sentences = [
@@ -52,8 +58,22 @@
'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.',
# From Google's Tacotron example page:
'Generative adversarial network or variational auto-encoder.',
'The buses aren\'t the problem, they actually provide a solution.',
'Does the quick brown fox jump over the lazy dog?',
'Basilar membrane and otolaryngology are not auto-correlations.',
'He has read the whole thing.',
'He reads books.',
"Don't desert me here in the desert!",
'He thought it was time to present the present.',
'Thisss isrealy awhsome.',
'Punctuation sensitivity, is working.',
'Punctuation sensitivity is working.',
"The buses aren't the problem, they actually provide a solution.",
"The buses aren't the PROBLEM, they actually provide a SOLUTION.",
"The quick brown fox jumps over the lazy dog.",
"Does the quick brown fox jump over the lazy dog?",
"Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?",
"She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.",
"The blue lagoon is a nineteen eighty American romance adventure film.",
"Tajima Airport serves Toyooka.",
'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.',
]
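For reference, a minimal sketch of how the decay-related hparams above could drive a TF 1.x learning-rate schedule. The helper name _learning_rate_decay and the use of tf.train.exponential_decay are illustrative assumptions, not code from this commit:

import tensorflow as tf
from hparams import hparams

def _learning_rate_decay(global_step):
    #Assumed schedule: exponential decay controlled by decay_steps/decay_rate,
    #clipped so the rate never drops below final_learning_rate.
    lr = tf.train.exponential_decay(
        hparams.initial_learning_rate,
        global_step,
        hparams.decay_steps,
        hparams.decay_rate,
        staircase=False)
    return tf.maximum(lr, hparams.final_learning_rate)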

131 changes: 131 additions & 0 deletions tacotron/models/attention.py
@@ -0,0 +1,131 @@
"""Attention file for location based attention (compatible with tensorflow attention wrapper)"""

import tensorflow as tf
from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import _BaseAttentionMechanism
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.layers import core as layers_core
from tensorflow.python.ops import variable_scope
from hparams import hparams


def _location_based_score(W_query, attention_weights, W_keys):
"""Impelements Bahdanau-style (cumulative) scoring function.
This attention is described in:
J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben-
gio, “Attention-based models for speech recognition,” in Ad-
vances in Neural Information Processing Systems, 2015, pp.
577–585.
#######################################################################
hybrid attention (content-based + location-based)
f = F * α_{i-1}
energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f)))
#######################################################################
Args:
W_query: Tensor, shape '[batch_size, num_units]' to compare to location features.
attention_weights (alignments): previous attention weights, shape '[batch_size, max_time]'
Returns:
A '[batch_size, max_time]'
"""
dtype = W_query.dtype
# Get the number of hidden units from the trailing dimension of query
num_units = W_query.shape[-1].value or array_ops.shape(W_query)[-1]

# [batch_size, max_time] -> [batch_size, max_time, 1]
attention_weights = tf.expand_dims(attention_weights, axis=2)
# location features [batch_size, max_time, filters]
f = tf.layers.conv1d(attention_weights, filters=32,
kernel_size=31, padding='same',
name='location_features')

# Projected location features [batch_size, max_time, attention_dim]
W_fil = tf.contrib.layers.fully_connected(
f,
num_outputs=num_units,
activation_fn=None,
weights_initializer=tf.truncated_normal_initializer(
stddev=hparams.attention_stddev_init),
biases_initializer=tf.zeros_initializer(),
scope='W_filter')

v_a = tf.get_variable(
'v_a', shape=[num_units], dtype=tf.float32)

return tf.reduce_sum(v_a * tf.tanh(W_keys + tf.expand_dims(W_query, axis=1) + W_fil), axis=2)


class LocationBasedAttention(_BaseAttentionMechanism):
"""Impelements Bahdanau-style (cumulative) scoring function.
Usually referred to as "hybrid" attention (content-based + location-based)
This attention is described in:
J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben-
gio, “Attention-based models for speech recognition,” in Ad-
vances in Neural Information Processing Systems, 2015, pp.
577–585.
"""

def __init__(self,
num_units,
memory,
memory_sequence_length=None,
probability_fn=None,
score_mask_value=tf.float32.min,
name='LocationBasedAttention'):
"""Construct the Attention mechanism.
Args:
num_units: The depth of the query mechanism.
memory: The memory to query; usually the output of an RNN encoder. This
tensor should be shaped `[batch_size, max_time, ...]`.
memory_sequence_length (optional): Sequence lengths for the batch entries
in memory. If provided, the memory tensor rows are masked with zeros
for values past the respective sequence lengths.
probability_fn: (optional) A `callable`. Converts the score to
probabilities. The default is @{tf.nn.softmax}. Other options include
@{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
Its signature should be: `probabilities = probability_fn(score)`.
score_mask_value: (optional): The mask value for score before passing into
`probability_fn`. The default is -inf. Only used if
`memory_sequence_length` is not None.
name: Name to use when creating ops.
"""
if probability_fn is None:
probability_fn = nn_ops.softmax
wrapped_probability_fn = lambda score, _: probability_fn(score)
super(LocationBasedAttention, self).__init__(
query_layer=layers_core.Dense(
num_units, name='query_layer', use_bias=False),
memory_layer=layers_core.Dense(
num_units, name='memory_layer', use_bias=False),
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
score_mask_value=score_mask_value,
name=name)
self._num_units = num_units
self._name = name

def __call__(self, query, state):
"""Score the query based on the keys and values.
Args:
query: Tensor of dtype matching `self.values` and shape
`[batch_size, query_depth]`.
state: previous alignments, Tensor of dtype matching `self.values` and shape
`[batch_size, alignments_size]`
(`alignments_size` is memory's `max_time`).
Returns:
alignments: Tensor of dtype matching `self.values` and shape
`[batch_size, alignments_size]` (`alignments_size` is memory's
`max_time`).
"""
previous_alignments = state
with variable_scope.variable_scope(None, "location_based_attention", [query]):
# processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim]
processed_query = self.query_layer(query) if self.query_layer else query
# energy shape [batch_size, max_time]
energy = _location_based_score(processed_query, previous_alignments, self._keys)
# alignments shape = energy shape = [batch_size, max_time]
alignments = self._probability_fn(energy, previous_alignments)
#The tensorflow attention wrapper requires a next_state to be returned; here it is simply the new alignments
next_state = alignments
return alignments, next_state
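For context, a minimal sketch of how LocationBasedAttention might be wired into tf.contrib.seq2seq.AttentionWrapper around a stacked decoder LSTM. The function name build_attention_decoder_cell, the import path, and the exact cell wiring are illustrative assumptions, not part of this commit:

import tensorflow as tf
from hparams import hparams
from tacotron.models.attention import LocationBasedAttention

def build_attention_decoder_cell(encoder_outputs, input_lengths):
    #encoder_outputs: [batch_size, max_time, 2 * encoder_lstm_units]
    attention_mechanism = LocationBasedAttention(
        num_units=hparams.attention_dim,
        memory=encoder_outputs,
        memory_sequence_length=input_lengths)

    #Stack of decoder LSTM cells (decoder_layers x decoder_lstm_units)
    decoder_cell = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.LSTMCell(hparams.decoder_lstm_units)
         for _ in range(hparams.decoder_layers)])

    #AttentionWrapper calls LocationBasedAttention.__call__(query, state)
    #at every decoding step to recompute the alignments.
    return tf.contrib.seq2seq.AttentionWrapper(
        decoder_cell,
        attention_mechanism,
        alignment_history=True)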
