
Commit

attention + typos + voice test
Rayhane-mamah authored Mar 10, 2018
1 parent be09c5a commit 7393fd5
Showing 13 changed files with 194 additions and 113 deletions.
1 change: 0 additions & 1 deletion tacotron/datasets/feeder.py
@@ -100,7 +100,6 @@ def _prepare_batch(batch, outputs_per_step):
inputs = _prepare_inputs([x[0] for x in batch])
input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32)
mel_targets = _prepare_targets([x[1] for x in batch], outputs_per_step)
- #linear_targets = _prepare_targets([x[2] for x in batch], outputs_per_step)
return (inputs, input_lengths, mel_targets)

def _prepare_inputs(inputs):
5 changes: 1 addition & 4 deletions tacotron/datasets/preprocessor.py
@@ -54,12 +54,9 @@ def _process_utterance(out_dir, index, wav_path, text):
# Load the audio as numpy array
wav = audio.load_wav(wav_path)

- # Compute the linear-scale spectrogram from the wav to calculate n_frames
- spectrogram = audio.spectrogram(wav).astype(np.float32)
- n_frames = spectrogram.shape[1]

# Compute the mel scale spectrogram from the wav
mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
+ n_frames = mel_spectrogram.shape[1]

# Write the spectrogram to disk
mel_filename = 'ljspeech-mel-{:05d}.npy'.format(index)
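Note: dropping the linear spectrogram does not change n_frames, since both spectrograms share the same STFT hop: for a centered STFT, n_frames ≈ 1 + floor(len(wav) / hop_size) regardless of how many frequency bins are kept, so mel_spectrogram.shape[1] gives the same count while skipping an otherwise unused computation.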
68 changes: 68 additions & 0 deletions tacotron/griffin_lim_synthesis_example.ipynb
@@ -0,0 +1,68 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"(660, 80)"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"from utils.audio import *\n",
"import os\n",
"\n",
"mel_folder = 'logs-Tacotron'\n",
"mel_file = 'ljspeech-mel-prediction-step-1400.npy'\n",
"\n",
"mel_file = os.path.join(mel_folder, mel_file) \n",
"mel_spectro = np.load(mel_file)\n",
"mel_spectro.shape"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"wav = inv_mel_spectrogram(mel_spectro.T)\n",
"save_wav(wav, 'wav_out/test.wav')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
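The notebook above is the commit's "voice test": it loads a mel-spectrogram prediction saved during training and inverts it to a waveform. The actual inversion lives in utils/audio.py, which is not part of this diff; as a hedged sketch of what inv_mel_spectrogram typically involves (function names and STFT parameters here are illustrative, not the repository's API), Griffin-Lim phase reconstruction with the griffin_lim_iters=60 setting from hparams.py looks roughly like this:

import numpy as np
import librosa

def griffin_lim(magnitude, n_iters=60, n_fft=2048, hop_length=None):
    # magnitude: linear-frequency magnitude spectrogram, shape [1 + n_fft//2, frames].
    # Start from a random phase and alternate STFT/iSTFT, keeping the target
    # magnitude while refining only the phase estimate.
    angles = np.exp(2j * np.pi * np.random.rand(*magnitude.shape))
    signal = librosa.istft(magnitude * angles, hop_length=hop_length)
    for _ in range(n_iters):
        rebuilt = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
        angles = np.exp(1j * np.angle(rebuilt))
        signal = librosa.istft(magnitude * angles, hop_length=hop_length)
    return signal

Since the notebook starts from an 80-band mel spectrogram (shape (660, 80) above), inv_mel_spectrogram presumably also undoes the dB normalization and maps mel bands back to linear frequency bins (for example via a pseudo-inverse of the mel filterbank) before running Griffin-Lim; the power=1.3 value added to hparams.py below is commonly the exponent applied to magnitudes beforehand to reduce artifacts.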
16 changes: 9 additions & 7 deletions tacotron/hparams.py
@@ -20,9 +20,11 @@
ref_level_db=20,
fmin=125,
fmax=7600,
+ power=1.3,
+ griffin_lim_iters=60,

#Model
- outputs_per_step = 1, #number of frames to generate at each decoding step
+ outputs_per_step = 5, #number of frames to generate at each decoding step (speeds up computation and allows for higher batch size)
embedding_dim = 512, #dimension of embedding space
enc_conv_num_layers=3, #number of encoder convolutional layers
enc_conv_kernel_size=(5, ), #size of encoder convolution filters for each layer
@@ -35,19 +37,19 @@
postnet_num_layers=5, #number of postnet convolutional layers
postnet_kernel_size=(5, ), #size of postnet convolution filters for each layer
postnet_channels=512, #number of postnet convolution filters for each layer
- max_iters=810, #Max decoder steps during inference (feel free to change it)
+ max_iters=175, #Max decoder steps during inference (feel free to change it)

#Training
batch_size = 32, #number of training samples on each training steps
- reg_weight = 10e-6, #regularization weight (for l2 regularization)
+ reg_weight = 10**(-6), #regularization weight (for l2 regularization)
decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay
decay_steps = 50000, #starting point for learning rate decay (and determines the decay slope)
- decay_rate = 0.97, #learning rate decay rate
- initial_learning_rate = 10e-3, #starting learning rate
- final_learning_rate = 10e-5, #minimal learning rate
+ decay_rate = 0.4, #learning rate decay rate
+ initial_learning_rate = 10**(-3), #starting learning rate
+ final_learning_rate = 10**(-5), #minimal learning rate
adam_beta1 = 0.9, #AdamOptimizer beta1 parameter
adam_beta2 = 0.999, #AdamOptimizer beta2 parameter
- adam_epsilon = 10e-6, #AdamOptimizer beta3 parameter
+ adam_epsilon = 10**(-6), #AdamOptimizer epsilon parameter
zoneout_rate=0.1, #zoneout rate for all LSTM cells in the network
dropout_rate=0.5, #dropout rate for all convolutional layers + prenet

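Several of the value changes above are really a Python numeric-literal fix rather than retuning: 10e-6 is scientific notation for 10 × 10^-6, so the old learning rates, regularization weight and Adam epsilon were ten times larger than the 10^-3 / 10^-5 / 10^-6 values the new literals spell out. A quick interpreter check:

>>> 10e-3, 10 ** (-3)   # old vs. new initial_learning_rate
(0.01, 0.001)
>>> 10e-6, 10 ** (-6)   # old vs. new reg_weight / adam_epsilon
(1e-05, 1e-06)

The outputs_per_step = 5 change means the decoder now emits five mel frames per step, which is why max_iters can drop from 810 to 175 (175 × 5 = 875 frames still covers slightly more audio than 810 single-frame steps) and why tacotron.py below gains a reshape from [batch, steps, num_mels × 5] back to [batch, steps × 5, num_mels].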
8 changes: 5 additions & 3 deletions tacotron/models/attention.py
@@ -1,7 +1,7 @@
"""Attention file for location based attention (compatible with tensorflow attention wrapper)"""

import tensorflow as tf
- from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import _BaseMonotonicAttentionMechanism
+ from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import _BaseAttentionMechanism
from tensorflow.python.ops import nn_ops
from tensorflow.python.layers import core as layers_core
from tensorflow.python.ops import variable_scope
@@ -37,6 +37,7 @@ def _location_sensitive_score(W_query, attention_weights, W_keys):
# location features [batch_size, max_time, filters]
f = tf.layers.conv1d(attention_weights, filters=32,
kernel_size=(31, ), padding='same',
+ kernel_initializer=tf.contrib.layers.xavier_initializer(),
name='location_features')

# Projected location features [batch_size, max_time, attention_dim]
@@ -49,12 +50,13 @@ def _location_sensitive_score(W_query, attention_weights, W_keys):
scope='W_filter')

v_a = tf.get_variable(
- 'v_a', shape=[num_units], dtype=tf.float32)
+ 'v_a', shape=[num_units], dtype=tf.float32,
+ initializer=tf.contrib.layers.xavier_initializer())

return tf.reduce_sum(v_a * tf.tanh(W_keys + tf.expand_dims(W_query, axis=1) + W_fil), axis=2)


- class LocationSensitiveAttention(_BaseMonotonicAttentionMechanism):
+ class LocationSensitiveAttention(_BaseAttentionMechanism):
"""Impelements Bahdanau-style (cumulative) scoring function.
Usually referred to as "hybrid" attention (content-based + location-based)
This attention is described in:
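For reference, the energy computed by _location_sensitive_score above is the hybrid (content-based + location-based) attention of Chorowski et al. (2015):

    e_{i,j} = v_a^T tanh(W s_i + V h_j + U f_{i,j})

where s_i is the decoder query state, h_j are the encoder outputs, and f_i = F * alpha_{i-1} are the location features produced by convolving the previous (here cumulative) alignments with the 32-filter, width-31 convolution shown above. In the code, W_query, W_keys and W_fil are the already-projected query, key and location-feature terms, so the returned value is reduce_sum(v_a * tanh(W_keys + W_query + W_fil)) over the attention dimension. The base-class switch from _BaseMonotonicAttentionMechanism to _BaseAttentionMechanism presumably reflects that these energies are normalized with an ordinary softmax rather than the monotonic-attention normalization.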
4 changes: 2 additions & 2 deletions tacotron/models/dynamic_decoder.py
@@ -224,12 +224,12 @@ def _maybe_copy_state(new, cur):
stop_token_loss = res[5]

#Average <stop_token> error over decoding steps
- avg_stop_loss = stop_token_loss / steps
+ #avg_stop_loss = stop_token_loss / steps

final_outputs = nest.map_structure(
lambda ta: ta.stack(), final_outputs_ta)
if not output_time_major:
final_outputs = nest.map_structure(
_transpose_batch_time, final_outputs)

- return final_outputs, final_state, avg_stop_loss
+ return final_outputs, final_state, stop_token_loss
9 changes: 6 additions & 3 deletions tacotron/models/modules.py
@@ -13,7 +13,8 @@ def conv1d(inputs, kernel_size, channels, activation, is_training, scope):
filters=channels,
kernel_size=kernel_size,
activation=activation,
- padding='same')
+ padding='same',
+ kernel_initializer=tf.contrib.layers.xavier_initializer())
batched = tf.layers.batch_normalization(conv1d_output, training=is_training)
return tf.layers.dropout(batched, rate=drop_rate, training=is_training,
name='dropout_{}'.format(scope))
@@ -91,7 +92,9 @@ def prenet(inputs, is_training, layer_sizes=[128, 128], scope=None):

with tf.variable_scope(scope):
for i, size in enumerate(layer_sizes):
- dense = tf.layers.dense(x, units=size, activation=tf.nn.relu, name='dense_{}'.format(i + 1))
+ dense = tf.layers.dense(x, units=size, activation=tf.nn.relu,
+ kernel_initializer=tf.contrib.layers.xavier_initializer(),
+ name='dense_{}'.format(i + 1))
#The paper discussed introducing diversity in generation at inference time
#by using a dropout of 0.5 only in prenet layers.
x = tf.layers.dropout(dense, rate=drop_rate, training=is_training,
@@ -115,7 +118,7 @@ def stop_token_projection(x, shape=1, activation=lambda _: _, weights_name='stop
inference time for stop token prediction
"""

- st_W = tf.get_variable(weights_name, shape=[x.shape[-1], 1], dtype=tf.float32, initializer=tf.truncated_normal_initializer())
+ st_W = tf.get_variable(weights_name, shape=[x.shape[-1], 1], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
st_b = tf.get_variable(bias_name, shape=[1], dtype=tf.float32, initializer=tf.zeros_initializer())

output = activation(tf.add(tf.matmul(x, st_W), st_b))
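The recurring edit in this commit (here, in attention.py above and in zoneout_LSTM.py below) swaps default and truncated-normal initializers for Xavier/Glorot initialization. As a reference sketch only (not the TensorFlow implementation), the uniform Glorot variant that tf.contrib.layers.xavier_initializer() uses by default draws weights like this:

import numpy as np

def xavier_uniform(fan_in, fan_out):
    # Glorot & Bengio (2010): pick the range so that activation and gradient
    # variances stay roughly constant from layer to layer.
    limit = np.sqrt(6.0 / (fan_in + fan_out))
    return np.random.uniform(-limit, limit, size=(fan_in, fan_out))

# e.g. one prenet dense layer (input size illustrative):
W = xavier_uniform(256, 128)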
32 changes: 0 additions & 32 deletions tacotron/models/rnn_wrappers.py
@@ -97,35 +97,3 @@ def call(self, inputs, state):

def zero_state(self, batch_size, dtype):
return self._cell.zero_state(batch_size, dtype)


# class LinearProjectionWrapper(RNNCell):
# """Operator adding an output projection to the given cell.
# This wrapper will perform a linear transformation with specified activation function.(Default to None)
# """
# def __init__(self, cell, projection_dim, activation=None):
# super(LinearProjectionWrapper, self).__init__()
# self._cell = cell
# self._projection_dim = projection_dim
# self._activation = activation

# @property
# def state_size(self):
# return self._cell.state_size

# @property
# def output_size(self):
# return self._projection_dim

# def zero_state(self, batch_size, dtype):
# with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
# return self._cell.zero_state(batch_size, dtype)

# def call(self, inputs, state):
# """Run the cell and output projection on inputs, starting from state."""
# output, res_state = self._cell(inputs, state)
# projected = projection(output, self._projection_dim)
# if self._activation:
# projected = self._activation(projected)

# return projected, res_state
42 changes: 24 additions & 18 deletions tacotron/models/tacotron.py
@@ -4,7 +4,7 @@
from .helpers import TacoTrainingHelper, TacoTestHelper
from .modules import *
from models.zoneout_LSTM import ZoneoutLSTMCell
- from tensorflow.contrib.seq2seq import AttentionWrapper
+ from tensorflow.contrib.seq2seq import AttentionWrapper, LuongAttention
from .rnn_wrappers import *
from tensorflow.contrib.rnn import MultiRNNCell, OutputProjectionWrapper
from .attention import LocationSensitiveAttention
@@ -53,9 +53,9 @@ def initialize(self, inputs, input_lengths, mel_targets=None, gta=False):

#Attention
attention_cell = AttentionWrapper(
- DecoderPrenetWrapper(ZoneoutLSTMCell(hp.attention_dim, is_training,
- zoneout_factor_cell=hp.zoneout_rate,
- zoneout_factor_output=hp.zoneout_rate), is_training),
+ DecoderPrenetWrapper(ZoneoutLSTMCell(hp.attention_dim, is_training, #Separate LSTM for attention mechanism
+ zoneout_factor_cell=hp.zoneout_rate, #based on original tacotron architecture
+ zoneout_factor_output=hp.zoneout_rate), is_training),
LocationSensitiveAttention(hp.attention_dim, encoder_outputs),
alignment_history=True,
output_attention=False,
@@ -72,7 +72,7 @@ def initialize(self, inputs, input_lengths, mel_targets=None, gta=False):
#Concat LSTM output with context vector
concat_decoder_cell = ConcatLSTMOutputAndAttentionWrapper(decoder_cell)

- #Projection to mel-spectrogram dimension (linear transformation)
+ #Projection to mel-spectrogram dimension (times number of outputs per step) (linear transformation)
output_cell = OutputProjectionWrapper(concat_decoder_cell, hp.num_mels * hp.outputs_per_step)

#Define the helper for our decoder
@@ -81,7 +81,7 @@ def initialize(self, inputs, input_lengths, mel_targets=None, gta=False):
else:
self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

#We"ll only limit decoder time steps during inference (consult hparams.py to modify the value)
#We'll only limit decoder time steps during inference (consult hparams.py to modify the value)
max_iterations = None if is_training else hp.max_iters

#initial decoder state
@@ -90,15 +90,19 @@ def initialize(self, inputs, input_lengths, mel_targets=None, gta=False):
#Decode
(decoder_output, _), final_decoder_state, self.stop_token_loss = dynamic_decode(
CustomDecoder(output_cell, self.helper, decoder_init_state),
- impute_finished=True, #Cut out padded parts
+ impute_finished=True, #Cut out padded parts (enabled)
maximum_iterations=max_iterations)

+ # Reshape outputs to be one output per entry
+ decoder_output = tf.reshape(decoder_output, [batch_size, -1, hp.num_mels])

#Compute residual using post-net
residual = postnet(decoder_output, is_training,
kernel_size=hp.postnet_kernel_size, channels=hp.postnet_channels)

#Project residual to same dimension as mel spectrogram
- projected_residual = projection(residual, shape=hp.num_mels,
+ projected_residual = projection(residual,
+ shape=hp.num_mels,
scope='residual_projection')

#Compute the mel spectrogram
@@ -136,7 +140,8 @@ def add_loss(self):
# Get all trainable variables
all_vars = tf.trainable_variables()
# Compute the regularization term
- regularization = tf.add_n([tf.nn.l2_loss(v) for v in all_vars]) * hp.reg_weight
+ regularization = tf.add_n([tf.nn.l2_loss(v) for v in all_vars
+ if not('bias' in v.name or 'Bias' in v.name)]) * hp.reg_weight

# Compute final loss term
self.before_loss = before
@@ -173,15 +178,16 @@ def add_optimizer(self, global_step):
global_step=global_step)

def _learning_rate_decay(self, init_lr, global_step):
- # Exponential decay starting after 50,000 iterations
+ # Exponential decay starting after 50,000 iterations (ignored for now)
# We won't drop learning rate below 10e-5
hp = self._hparams
step = tf.cast(global_step + 1, dtype=tf.float32)
- if tf.greater(step, self.decay_steps) == True:
- lr = tf.train.exponential_decay(init_lr,
- global_step - decay_steps + 1,
- self.decay_steps,
- self.decay_rate,
- name='exponential_decay')
- return max(hp.final_learning_rate, lr)
- return init_lr
+ #Testing decaying rate since beginning (as the model seems to train faster than expected)
+ #if tf.greater(step, self.decay_steps) == True:
+ lr = tf.train.exponential_decay(init_lr,
+ global_step - self.decay_steps + 1,
+ self.decay_steps,
+ self.decay_rate,
+ name='exponential_decay')
+ return tf.maximum(hp.final_learning_rate, lr)
+ #return init_lr
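Two things are worth noting about the removed code path: the check "if tf.greater(step, self.decay_steps) == True" compares a graph-mode tensor to the Python constant True, which is never equal, so the decay branch never ran (and it also referenced an undefined decay_steps instead of self.decay_steps); the function therefore always returned init_lr. The rewritten version applies exponential decay from the first step and clips it with tf.maximum at final_learning_rate. A plain-Python sketch of the resulting schedule with the hparams values above (initial 1e-3, floor 1e-5, decay_steps 50000, decay_rate 0.4):

def learning_rate(step, init_lr=1e-3, final_lr=1e-5, decay_steps=50000, decay_rate=0.4):
    # Mirrors tf.train.exponential_decay(init_lr, step - decay_steps + 1,
    # decay_steps, decay_rate) followed by tf.maximum(final_lr, ...).
    decayed = init_lr * decay_rate ** ((step - decay_steps + 1) / decay_steps)
    return max(final_lr, decayed)

# learning_rate(50000)  -> ~1.0e-3
# learning_rate(150000) -> ~1.6e-4
# learning_rate(300000) -> ~1.0e-5 (clipped to the floor shortly after)

Because the offset (step - decay_steps + 1) is negative early on, the schedule actually starts slightly above init_lr (about 2.5e-3 at step 0) and passes through 1e-3 around step 50000.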
3 changes: 2 additions & 1 deletion tacotron/models/zoneout_LSTM.py
@@ -22,7 +22,8 @@ class ZoneoutLSTMCell(RNNCell):

def __init__(self, num_units, is_training, input_size=None,
use_peepholes=False, cell_clip=None,
- initializer=orthogonal_initializer(),
+ #initializer=orthogonal_initializer(),
+ initializer=tf.contrib.layers.xavier_initializer(),
num_proj=None, proj_clip=None, ext_proj=None,
forget_bias=1.0,
state_is_tuple=True,
