Code reorder + Architecture review
I reorganized most of the code to make it easier to modify, and reviewed the network architecture to bring it closer to DeepMind's paper.
Rayhane-mamah authored Mar 15, 2018
1 parent 2f3d655 commit 919c96a
Showing 11 changed files with 538 additions and 203 deletions.
22 changes: 16 additions & 6 deletions tacotron/datasets/feeder.py
@@ -10,6 +10,7 @@

_batches_per_group = 32
_pad = 0
_token_pad = 1.

class Feeder(threading.Thread):
"""
@@ -36,17 +37,17 @@ def __init__(self, coordinator, metadata_filename, hparams):
tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'),
#tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets')
tf.placeholder(tf.float32, shape=(None, None), name='token_targets'),
]

# Create queue for buffering data
queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32], name='input_queue')
queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32], name='input_queue')
self._enqueue_op = queue.enqueue(self._placeholders)
self.inputs, self.input_lengths, self.mel_targets = queue.dequeue()
self.inputs, self.input_lengths, self.mel_targets, self.token_targets = queue.dequeue()
self.inputs.set_shape(self._placeholders[0].shape)
self.input_lengths.set_shape(self._placeholders[1].shape)
self.mel_targets.set_shape(self._placeholders[2].shape)
#self._linear_targets.set_shape(self._placeholders[3].shape)
self.token_targets.set_shape(self._placeholders[3].shape)

def start_in_session(self, session):
self._session = session
@@ -92,15 +93,17 @@ def _get_next_example(self):

input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
mel_target = np.load(os.path.join(self._datadir, meta[0]))
return (input_data, mel_target, len(mel_target))
token_target = np.asarray([0.] * len(mel_target))
return (input_data, mel_target, token_target, len(mel_target))


def _prepare_batch(batch, outputs_per_step):
np.random.shuffle(batch)
inputs = _prepare_inputs([x[0] for x in batch])
input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32)
mel_targets = _prepare_targets([x[1] for x in batch], outputs_per_step)
return (inputs, input_lengths, mel_targets)
token_targets = _prepare_token_targets([x[2] for x in batch], outputs_per_step)
return (inputs, input_lengths, mel_targets, token_targets)

def _prepare_inputs(inputs):
max_len = max([len(x) for x in inputs])
@@ -110,12 +113,19 @@ def _prepare_targets(targets, alignment):
max_len = max([len(t) for t in targets]) + 1
return np.stack([_pad_target(t, _round_up(max_len, alignment)) for t in targets])

def _prepare_token_targets(targets, alignment):
max_len = max([len(t) for t in targets]) + 1
return np.stack([_pad_token_target(t, _round_up(max_len, alignment)) for t in targets])

def _pad_input(x, length):
return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)

def _pad_target(t, length):
return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=_pad)

def _pad_token_target(t, length):
return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=_token_pad)

def _round_up(x, multiple):
remainder = x % multiple
return x if remainder == 0 else x + multiple - remainder
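
As a quick illustration of the new stop-token targets: _get_next_example emits one 0 per real mel frame, and _pad_token_target fills the padded tail with 1s (note the +1 in _prepare_token_targets, which guarantees at least one trailing 1 even for the longest example in a batch). A minimal, self-contained numpy sketch of that behavior:

import numpy as np

_token_pad = 1.

def _pad_token_target(t, length):
    # Pad a 1-D stop-token target out to `length` with 1s ("stop").
    return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=_token_pad)

# A 4-frame utterance padded to 8 decoder steps:
token_target = np.asarray([0.] * 4)        # 0 while frames are real speech
print(_pad_token_target(token_target, 8))  # [0. 0. 0. 0. 1. 1. 1. 1.]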
3 changes: 2 additions & 1 deletion tacotron/griffin_lim_synthesis_example.ipynb
@@ -4,6 +4,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
Expand All @@ -12,7 +13,7 @@
"from utils.audio import *\n",
"import os\n",
"\n",
"n_sample = 5100 #Change n_steps here\n",
"n_sample = 0 #Change n_steps here\n",
"mel_folder = 'logs-Tacotron' #Or change file path\n",
"mel_file = 'ljspeech-mel-prediction-step-{}.npy'.format(n_sample) #Or file name (for other generated mels)\n",
"out_dir = 'wav_out'\n",
4 changes: 2 additions & 2 deletions tacotron/hparams.py
@@ -23,7 +23,7 @@
ref_level_db=20,
fmin=125,
fmax=7600,

power=1.3,
griffin_lim_iters=60,

@@ -41,7 +41,7 @@
attention_filters = 20, #number of attention convolution filters
attention_kernel = (7, ), #kernel size of attention convolution

prenet_layers=[256, 128], #number of layers and number of units of prenet
prenet_layers=[256, 256], #number of layers and number of units of prenet
decoder_layers=2, #number of decoder lstm layers
decoder_lstm_units=1024, #number of decoder lstm units on each layer
max_iters=175, #Max decoder steps during inference (feel free to change it)
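
For reference, prenet_layers=[256, 256] now matches the two 256-unit fully connected pre-net layers described in the Tacotron 2 paper. A minimal sketch of how such a list can drive a pre-net stack (the prenet helper below is hypothetical, not the repository's actual module):

import tensorflow as tf

def prenet(inputs, layer_sizes=(256, 256), scope='prenet'):
    # One dense + ReLU per entry in layer_sizes, each followed by dropout.
    x = inputs
    with tf.variable_scope(scope):
        for i, size in enumerate(layer_sizes):
            x = tf.layers.dense(x, units=size, activation=tf.nn.relu,
                                name='dense_{}'.format(i + 1))
            # training=True on purpose: the paper applies pre-net dropout
            # at inference time as well.
            x = tf.layers.dropout(x, rate=0.5, training=True,
                                  name='dropout_{}'.format(i + 1))
    return x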
189 changes: 189 additions & 0 deletions tacotron/models/Architecture_wrappers.py
@@ -0,0 +1,189 @@
"""A set of wrappers usefull for tacotron 2 architecture
All notations and variable names were used in concordance with originial tensorflow implementation
"""
import collections
import numpy as np
import tensorflow as tf
from tensorflow.contrib.rnn import RNNCell
from tensorflow.python.framework import ops
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops import check_ops
from tensorflow.python.util import nest
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import tensor_array_ops
from tensorflow.python.framework import tensor_shape


_zero_state_tensors = rnn_cell_impl._zero_state_tensors



class TacotronEncoderCell(RNNCell):
"""Tacotron 2 Encoder Cell
Passes inputs through a stack of convolutional layers, then through a bidirectional
LSTM layer, to predict the hidden representation vector (or memory)
"""

def __init__(self, convolutional_layers, lstm_layer):
"""Initialize encoder parameters
Args:
convolutional_layers: Encoder convolutional block class
lstm_layer: encoder bidirectional lstm layer class
"""
super(TacotronEncoderCell, self).__init__()
#Initialize encoder layers
self._convolutions = convolutional_layers
self._cell = lstm_layer

def __call__(self, inputs, input_lengths):
#Pass input sequence through a stack of convolutional layers
conv_output = self._convolutions(inputs)

#Extract hidden representation from encoder lstm cells
hidden_representation = self._cell(conv_output, input_lengths)

#For shape visualization
self.conv_output_shape = conv_output.shape
return hidden_representation
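
A usage sketch for the encoder cell: it simply composes two callables, a convolution stack and a bidirectional LSTM wrapper. The stand-ins below are toys chosen for brevity, not the repository's real building blocks:

import tensorflow as tf

conv_stack = lambda x: tf.layers.conv1d(x, filters=512, kernel_size=5,
                                        padding='same', activation=tf.nn.relu)
bi_lstm = lambda x, lengths: x  # identity stand-in for the bidirectional LSTM

encoder_cell = TacotronEncoderCell(conv_stack, bi_lstm)

embedded_inputs = tf.random_normal([2, 50, 512])       # [batch, time, embed_dim]
input_lengths = tf.constant([50, 35], dtype=tf.int32)
encoder_outputs = encoder_cell(embedded_inputs, input_lengths)  # the attention "memory"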


class TacotronDecoderCellState(
collections.namedtuple("TacotronDecoderCellState",
("cell_state", "attention", "time", "alignments",
"alignment_history"))):
"""`namedtuple` storing the state of a `TacotronDecoderCell`.
Contains:
- `cell_state`: The state of the wrapped `RNNCell` at the previous time
step.
- `attention`: The attention emitted at the previous time step.
- `time`: int32 scalar containing the current time step.
- `alignments`: A single or tuple of `Tensor`(s) containing the alignments
emitted at the previous time step for each attention mechanism.
- `alignment_history`: a single or tuple of `TensorArray`(s)
containing alignment matrices from all time steps for each attention
mechanism. Call `stack()` on each to convert to a `Tensor`.
"""
pass

class TacotronDecoderCell(RNNCell):
"""Tactron 2 Decoder Cell
Decodes encoder output and previous mel frames into next r frames
"""

def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection):
"""Initialize decoder parameters
Args:
prenet: A tensorflow fully connected layer acting as the decoder pre-net
attention_mechanism: A _BaseAttentionMechanism instance, useful for
learning encoder-decoder alignments
rnn_cell: Instance of RNNCell, main body of the decoder
frame_projection: tensorflow fully connected layer with r * num_mels output units
stop_projection: tensorflow fully connected layer, expected to project to a scalar
and pass through a sigmoid activation
"""
super(TacotronDecoderCell, self).__init__()
#Initialize decoder layers
self._prenet = prenet
self._attention_mechanism = attention_mechanism
self._cell = rnn_cell
self._frame_projection = frame_projection
self._stop_projection = stop_projection

self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value

def _batch_size_checks(self, batch_size, error_message):
return [check_ops.assert_equal(batch_size,
self._attention_mechanism.batch_size,
message=error_message)]

@property
def output_size(self):
return self._frame_projection.shape

@property
def state_size(self):
"""The `state_size` property of `TacotronDecoderCell`.
Returns:
A `TacotronDecoderCellState` tuple containing shapes used by this object.
"""
return TacotronDecoderCellState(
cell_state=self._cell._cell.state_size,
time=tensor_shape.TensorShape([]),
attention=self._attention_layer_size,
alignments=self._attention_mechanism.alignments_size,
alignment_history=())

def zero_state(self, batch_size, dtype):
"""Return an initial (zero) state tuple for this `AttentionWrapper`.
Args:
batch_size: `0D` integer tensor: the batch size.
dtype: The internal state data type.
Returns:
A `TacotronDecoderCellState` tuple containing zeroed out tensors and,
possibly, empty `TensorArray` objects.
Raises:
ValueError: (or, possibly at runtime, InvalidArgument), if
`batch_size` does not match the output size of the encoder passed
to the wrapper object at initialization time.
"""
with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
cell_state = self._cell._cell.zero_state(batch_size, dtype)
error_message = (
"When calling zero_state of TacotronDecoderCell %s: " % self._base_name +
"Non-matching batch sizes between the memory "
"(encoder output) and the requested batch size.")
with ops.control_dependencies(
self._batch_size_checks(batch_size, error_message)):
cell_state = nest.map_structure(
lambda s: array_ops.identity(s, name="checked_cell_state"),
cell_state)
return TacotronDecoderCellState(
cell_state=cell_state,
time=array_ops.zeros([], dtype=tf.int32),
attention=_zero_state_tensors(self._attention_layer_size, batch_size,
dtype),
alignments=self._attention_mechanism.initial_alignments(batch_size, dtype),
alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0,
dynamic_size=True))

def call(self, inputs, state):
#Pass the previously predicted frame through the prenet
prenet_output = self._prenet(inputs)

#Compute the attention (context) vector and alignments using
#first decoder hidden state as query vector and previous alignments
#to extract location features
first_rnn_state, last_rnn_state = state.cell_state
previous_alignments = state.alignments
previous_alignment_history = state.alignment_history
context_vector, alignments = self._attention_mechanism(first_rnn_state.h, previous_alignments)

#Concat context vector and prenet output to form LSTM cells input
LSTM_input = tf.concat([prenet_output, context_vector], axis=-1)

#Unidirectional LSTM layers
LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state)

#Concat LSTM outputs and context vector to form projections inputs
projections_input = tf.concat([LSTM_output, context_vector], axis=-1)

#Compute predicted frames and predicted <stop_token>
cell_outputs = self._frame_projection(projections_input)
stop_tokens = self._stop_projection(projections_input)

#Save alignment history
alignment_history = previous_alignment_history.write(state.time, alignments)

#Prepare next decoder state
next_state = TacotronDecoderCellState(
time=state.time + 1,
cell_state=next_cell_state,
attention=context_vector,
alignments=alignments,
alignment_history=alignment_history)

return (cell_outputs, stop_tokens), next_state
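
To make the data flow concrete, here is a one-step usage sketch. Everything except TacotronDecoderCell itself is a stand-in: the import path and the (num_units, memory) constructor signature for LocationSensitiveAttention are assumptions, and the pre-net and projections are toy single layers (reduction factor r = 1):

import tensorflow as tf
from models.attention import LocationSensitiveAttention  # path assumed

batch_size, num_mels = 2, 80
encoder_outputs = tf.random_normal([batch_size, 100, 512])  # fake encoder memory

class ToyDecoderRNN(object):
    # Exposes the wrapped MultiRNNCell as `_cell`, as zero_state() expects.
    def __init__(self):
        self._cell = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.LSTMCell(1024) for _ in range(2)])
    def __call__(self, inputs, states):
        return self._cell(inputs, states)

prenet = lambda x: tf.layers.dense(x, 256, tf.nn.relu)     # toy 1-layer pre-net
frame_projection = lambda x: tf.layers.dense(x, num_mels)  # r = 1 for simplicity
stop_projection = lambda x: tf.layers.dense(x, 1, tf.nn.sigmoid)

attention = LocationSensitiveAttention(128, encoder_outputs)

decoder_cell = TacotronDecoderCell(prenet, attention, ToyDecoderRNN(),
                                   frame_projection, stop_projection)

state = decoder_cell.zero_state(batch_size, tf.float32)
go_frame = tf.zeros([batch_size, num_mels])                 # <GO> frame at t = 0
(frame, stop_token), state = decoder_cell(go_frame, state)  # one decoding step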
32 changes: 27 additions & 5 deletions tacotron/models/attention.py
@@ -4,7 +4,9 @@
from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import _BaseAttentionMechanism
from tensorflow.python.ops import nn_ops
from tensorflow.python.layers import core as layers_core
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import math_ops
from hparams import hparams


@@ -109,7 +111,7 @@ def __init__(self,
self._num_units = num_units
self._name = name

def __call__(self, query, state):
def get_alignments(self, query, previous_alignments):
"""Score the query based on the keys and values.
Args:
query: Tensor of dtype matching `self.values` and shape
@@ -122,14 +124,34 @@ def __call__(self, query, state):
`[batch_size, alignments_size]` (`alignments_size` is memory's
`max_time`).
"""
previous_alignments = state
with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]):
# processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim]
processed_query = self.query_layer(query) if self.query_layer else query
# energy shape [batch_size, max_time]
energy = _location_sensitive_score(processed_query, previous_alignments, self._keys)
# alignments shape = energy shape = [batch_size, max_time]
alignments = self._probability_fn(energy, previous_alignments)
#Seems pretty useless but tensorflow attention wrapper requires it to work properly
next_state = alignments
return alignments, next_state
return alignments
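
_location_sensitive_score above implements the hybrid attention of Chorowski et al., which extends additive (Bahdanau) attention with convolutional features computed from the previous alignments. A hedged sketch of what that scoring function computes (the real one is defined elsewhere in this file; the filter count and kernel size follow hparams.attention_filters and hparams.attention_kernel):

import tensorflow as tf

def _location_sensitive_score_sketch(processed_query, previous_alignments, keys):
    # processed_query: [batch_size, attention_dim] (query after the query layer)
    # previous_alignments: [batch_size, max_time]
    # keys: [batch_size, max_time, attention_dim] (processed memory)
    num_units = keys.shape[-1].value

    # Location features f = F * previous_alignments (a conv over the time axis)
    f = tf.layers.conv1d(tf.expand_dims(previous_alignments, axis=2),
                         filters=20, kernel_size=7, padding='same',
                         name='location_features')
    processed_location = tf.layers.dense(f, num_units, use_bias=False,
                                         name='location_layer')

    v_a = tf.get_variable('attention_variable', shape=[num_units],
                          dtype=processed_query.dtype)
    # Energy e = v_a^T tanh(W*query + U*f + keys), reduced over attention_dim
    return tf.reduce_sum(v_a * tf.tanh(tf.expand_dims(processed_query, 1)
                                       + processed_location + keys), axis=2)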


def __call__(self, query_vector, previous_alignments):
"""Computes the context vector and alignments.
"""
alignments = self.get_alignments(query_vector, previous_alignments)

# Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
expanded_alignments = array_ops.expand_dims(alignments, 1)

# Context is the inner product of alignments and values along the
# memory time dimension.
# alignments shape is
# [batch_size, 1, memory_time]
# attention_mechanism.values shape is
# [batch_size, memory_time, memory_size]
# the batched matmul is over memory_time, so the output shape is
# [batch_size, 1, memory_size].
# we then squeeze out the singleton dim.
context = math_ops.matmul(expanded_alignments, self.values)
context = array_ops.squeeze(context, [1])

return context, alignments
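
The batched matmul above is just an alignments-weighted average of the memory along its time axis; a quick self-contained numpy check of the shapes:

import numpy as np

batch_size, memory_time, memory_size = 2, 5, 3
alignments = np.full((batch_size, memory_time), 1.0 / memory_time)  # uniform weights
values = np.random.randn(batch_size, memory_time, memory_size)

expanded = alignments[:, None, :]                 # [batch_size, 1, memory_time]
context = np.matmul(expanded, values).squeeze(1)  # [batch_size, memory_size]

# With uniform alignments, the context is the mean over memory time.
assert np.allclose(context, values.mean(axis=1))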
