Code reorder + Architecture review
I reorganized most of the code to make it easier to modify, and reviewed the network architecture to bring it closer to DeepMind's paper.
Rayhane-mamah authored Mar 15, 2018
1 parent 2f3d655 commit 919c96a
Showing 11 changed files with 538 additions and 203 deletions.
22 changes: 16 additions & 6 deletions tacotron/datasets/feeder.py
@@ -10,6 +10,7 @@

_batches_per_group = 32
_pad = 0
_token_pad = 1.

class Feeder(threading.Thread):
"""
@@ -36,17 +37,17 @@ def __init__(self, coordinator, metadata_filename, hparams):
tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'),
#tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets')
tf.placeholder(tf.float32, shape=(None, None), name='token_targets'),
]

# Create queue for buffering data
queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32], name='input_queue')
queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32], name='input_queue')
self._enqueue_op = queue.enqueue(self._placeholders)
self.inputs, self.input_lengths, self.mel_targets = queue.dequeue()
self.inputs, self.input_lengths, self.mel_targets, self.token_targets = queue.dequeue()
self.inputs.set_shape(self._placeholders[0].shape)
self.input_lengths.set_shape(self._placeholders[1].shape)
self.mel_targets.set_shape(self._placeholders[2].shape)
#self._linear_targets.set_shape(self._placeholders[3].shape)
self.token_targets.set_shape(self._placeholders[3].shape)

def start_in_session(self, session):
self._session = session
@@ -92,15 +93,17 @@ def _get_next_example(self):

input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
mel_target = np.load(os.path.join(self._datadir, meta[0]))
return (input_data, mel_target, len(mel_target))
token_target = np.asarray([0.] * len(mel_target))
return (input_data, mel_target, token_target, len(mel_target))


def _prepare_batch(batch, outputs_per_step):
np.random.shuffle(batch)
inputs = _prepare_inputs([x[0] for x in batch])
input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32)
mel_targets = _prepare_targets([x[1] for x in batch], outputs_per_step)
return (inputs, input_lengths, mel_targets)
token_targets = _prepare_token_targets([x[2] for x in batch], outputs_per_step)
return (inputs, input_lengths, mel_targets, token_targets)

def _prepare_inputs(inputs):
max_len = max([len(x) for x in inputs])
@@ -110,12 +113,19 @@ def _prepare_targets(targets, alignment):
max_len = max([len(t) for t in targets]) + 1
return np.stack([_pad_target(t, _round_up(max_len, alignment)) for t in targets])

def _prepare_token_targets(targets, alignment):
max_len = max([len(t) for t in targets]) + 1
return np.stack([_pad_token_target(t, _round_up(max_len, alignment)) for t in targets])

def _pad_input(x, length):
return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)

def _pad_target(t, length):
return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=_pad)

def _pad_token_target(t, length):
return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=_token_pad)

def _round_up(x, multiple):
remainder = x % multiple
return x if remainder == 0 else x + multiple - remainder
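
As a quick illustration of the new stop-token targets: _get_next_example emits one 0 per real mel frame, and _pad_token_target fills the padded tail with 1s (note the +1 in _prepare_token_targets, which guarantees at least one trailing 1 even for the longest example in a batch). A minimal, self-contained numpy sketch of that behavior:

import numpy as np

_token_pad = 1.

def _pad_token_target(t, length):
    # Pad a 1-D stop-token target out to `length` with 1s ("stop").
    return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=_token_pad)

# A 4-frame utterance padded to 8 decoder steps:
token_target = np.asarray([0.] * 4)        # 0 while frames are real speech
print(_pad_token_target(token_target, 8))  # [0. 0. 0. 0. 1. 1. 1. 1.]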
3 changes: 2 additions & 1 deletion tacotron/griffin_lim_synthesis_example.ipynb
@@ -4,6 +4,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
Expand All @@ -12,7 +13,7 @@
"from utils.audio import *\n",
"import os\n",
"\n",
"n_sample = 5100 #Change n_steps here\n",
"n_sample = 0 #Change n_steps here\n",
"mel_folder = 'logs-Tacotron' #Or change file path\n",
"mel_file = 'ljspeech-mel-prediction-step-{}.npy'.format(n_sample) #Or file name (for other generated mels)\n",
"out_dir = 'wav_out'\n",
4 changes: 2 additions & 2 deletions tacotron/hparams.py
@@ -23,7 +23,7 @@
ref_level_db=20,
fmin=125,
fmax=7600,

power=1.3,
griffin_lim_iters=60,

@@ -41,7 +41,7 @@
attention_filters = 20, #number of attention convolution filters
attention_kernel = (7, ), #kernel size of attention convolution

prenet_layers=[256, 128], #number of layers and number of units of prenet
prenet_layers=[256, 256], #number of layers and number of units of prenet
decoder_layers=2, #number of decoder lstm layers
decoder_lstm_units=1024, #number of decoder lstm units on each layer
max_iters=175, #Max decoder steps during inference (feel free to change it)
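
For reference, prenet_layers=[256, 256] now matches the two 256-unit fully connected pre-net layers described in the Tacotron 2 paper. A minimal sketch of how such a list can drive a pre-net stack (the prenet helper below is hypothetical, not the repository's actual module):

import tensorflow as tf

def prenet(inputs, layer_sizes=(256, 256), scope='prenet'):
    # One dense + ReLU per entry in layer_sizes, each followed by dropout.
    x = inputs
    with tf.variable_scope(scope):
        for i, size in enumerate(layer_sizes):
            x = tf.layers.dense(x, units=size, activation=tf.nn.relu,
                                name='dense_{}'.format(i + 1))
            # training=True on purpose: the paper applies pre-net dropout
            # at inference time as well.
            x = tf.layers.dropout(x, rate=0.5, training=True,
                                  name='dropout_{}'.format(i + 1))
    return x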
189 changes: 189 additions & 0 deletions tacotron/models/Architecture_wrappers.py
@@ -0,0 +1,189 @@
"""A set of wrappers usefull for tacotron 2 architecture
All notations and variable names were used in concordance with originial tensorflow implementation
"""
import collections
import numpy as np
import tensorflow as tf
from tensorflow.contrib.rnn import RNNCell
from tensorflow.python.framework import ops
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops import check_ops
from tensorflow.python.util import nest
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import tensor_array_ops
from tensorflow.python.framework import tensor_shape


_zero_state_tensors = rnn_cell_impl._zero_state_tensors



class TacotronEncoderCell(RNNCell):
"""Tacotron 2 Encoder Cell
Passes inputs through a stack of convolutional layers, then through a bidirectional
LSTM layer, to predict the hidden representation vector (or memory)
"""

def __init__(self, convolutional_layers, lstm_layer):
"""Initialize encoder parameters
Args:
convolutional_layers: Encoder convolutional block class
lstm_layer: encoder bidirectional lstm layer class
"""
super(TacotronEncoderCell, self).__init__()
#Initialize encoder layers
self._convolutions = convolutional_layers
self._cell = lstm_layer

def __call__(self, inputs, input_lengths):
#Pass input sequence through a stack of convolutional layers
conv_output = self._convolutions(inputs)

#Extract hidden representation from encoder lstm cells
hidden_representation = self._cell(conv_output, input_lengths)

#For shape visualization
self.conv_output_shape = conv_output.shape
return hidden_representation
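
A usage sketch for the encoder cell: it simply composes two callables, a convolution stack and a bidirectional LSTM wrapper. The stand-ins below are toys chosen for brevity, not the repository's real building blocks:

import tensorflow as tf

conv_stack = lambda x: tf.layers.conv1d(x, filters=512, kernel_size=5,
                                        padding='same', activation=tf.nn.relu)
bi_lstm = lambda x, lengths: x  # identity stand-in for the bidirectional LSTM

encoder_cell = TacotronEncoderCell(conv_stack, bi_lstm)

embedded_inputs = tf.random_normal([2, 50, 512])       # [batch, time, embed_dim]
input_lengths = tf.constant([50, 35], dtype=tf.int32)
encoder_outputs = encoder_cell(embedded_inputs, input_lengths)  # the attention "memory"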


class TacotronDecoderCellState(
collections.namedtuple("TacotronDecoderCellState",
("cell_state", "attention", "time", "alignments",
"alignment_history"))):
"""`namedtuple` storing the state of a `TacotronDecoderCell`.
Contains:
- `cell_state`: The state of the wrapped `RNNCell` at the previous time
step.
- `attention`: The attention emitted at the previous time step.
- `time`: int32 scalar containing the current time step.
- `alignments`: A single or tuple of `Tensor`(s) containing the alignments
emitted at the previous time step for each attention mechanism.
- `alignment_history`: a single or tuple of `TensorArray`(s)
containing alignment matrices from all time steps for each attention
mechanism. Call `stack()` on each to convert to a `Tensor`.
"""
pass

class TacotronDecoderCell(RNNCell):
"""Tactron 2 Decoder Cell
Decodes encoder output and previous mel frames into next r frames
"""

def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection):
"""Initialize decoder parameters
Args:
prenet: A tensorflow fully connected layer acting as the decoder pre-net
attention_mechanism: A _BaseAttentionMechanism instance, useful for
learning encoder-decoder alignments
rnn_cell: Instance of RNNCell, main body of the decoder
frame_projection: tensorflow fully connected layer with r * num_mels output units
stop_projection: tensorflow fully connected layer, expected to project to a scalar
and pass through a sigmoid activation
"""
super(TacotronDecoderCell, self).__init__()
#Initialize decoder layers
self._prenet = prenet
self._attention_mechanism = attention_mechanism
self._cell = rnn_cell
self._frame_projection = frame_projection
self._stop_projection = stop_projection

self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value

def _batch_size_checks(self, batch_size, error_message):
return [check_ops.assert_equal(batch_size,
self._attention_mechanism.batch_size,
message=error_message)]

@property
def output_size(self):
return self._frame_projection.shape

@property
def state_size(self):
"""The `state_size` property of `TacotronDecoderCell`.
Returns:
A `TacotronDecoderCellState` tuple containing shapes used by this object.
"""
return TacotronDecoderCellState(
cell_state=self._cell._cell.state_size,
time=tensor_shape.TensorShape([]),
attention=self._attention_layer_size,
alignments=self._attention_mechanism.alignments_size,
alignment_history=())

def zero_state(self, batch_size, dtype):
"""Return an initial (zero) state tuple for this `AttentionWrapper`.
Args:
batch_size: `0D` integer tensor: the batch size.
dtype: The internal state data type.
Returns:
A `TacotronDecoderCellState` tuple containing zeroed out tensors and,
possibly, empty `TensorArray` objects.
Raises:
ValueError: (or, possibly at runtime, InvalidArgument), if
`batch_size` does not match the output size of the encoder passed
to the wrapper object at initialization time.
"""
with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
cell_state = self._cell._cell.zero_state(batch_size, dtype)
error_message = (
"When calling zero_state of TacotronDecoderCell %s: " % self._base_name +
"Non-matching batch sizes between the memory "
"(encoder output) and the requested batch size.")
with ops.control_dependencies(
self._batch_size_checks(batch_size, error_message)):
cell_state = nest.map_structure(
lambda s: array_ops.identity(s, name="checked_cell_state"),
cell_state)
return TacotronDecoderCellState(
cell_state=cell_state,
time=array_ops.zeros([], dtype=tf.int32),
attention=_zero_state_tensors(self._attention_layer_size, batch_size,
dtype),
alignments=self._attention_mechanism.initial_alignments(batch_size, dtype),
alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0,
dynamic_size=True))

def call(self, inputs, state):
#Pass the previously predicted frame through the prenet
prenet_output = self._prenet(inputs)

#Compute the attention (context) vector and alignments using
#first decoder hidden state as query vector and previous alignments
#to extract location features
first_rnn_state, last_rnn_state = state.cell_state
previous_alignments = state.alignments
previous_alignment_history = state.alignment_history
context_vector, alignments = self._attention_mechanism(first_rnn_state.h, previous_alignments)

#Concat context vector and prenet output to form LSTM cells input
LSTM_input = tf.concat([prenet_output, context_vector], axis=-1)

#Unidirectional LSTM layers
LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state)

#Concat LSTM outputs and context vector to form projections inputs
projections_input = tf.concat([LSTM_output, context_vector], axis=-1)

#Compute predicted frames and predicted <stop_token>
cell_outputs = self._frame_projection(projections_input)
stop_tokens = self._stop_projection(projections_input)

#Save alignment history
alignment_history = previous_alignment_history.write(state.time, alignments)

#Prepare next decoder state
next_state = TacotronDecoderCellState(
time=state.time + 1,
cell_state=next_cell_state,
attention=context_vector,
alignments=alignments,
alignment_history=alignment_history)

return (cell_outputs, stop_tokens), next_state
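
To make the data flow concrete, here is a one-step usage sketch. Everything except TacotronDecoderCell itself is a stand-in: the import path and the (num_units, memory) constructor signature for LocationSensitiveAttention are assumptions, and the pre-net and projections are toy single layers (reduction factor r = 1):

import tensorflow as tf
from models.attention import LocationSensitiveAttention  # path assumed

batch_size, num_mels = 2, 80
encoder_outputs = tf.random_normal([batch_size, 100, 512])  # fake encoder memory

class ToyDecoderRNN(object):
    # Exposes the wrapped MultiRNNCell as `_cell`, as zero_state() expects.
    def __init__(self):
        self._cell = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.LSTMCell(1024) for _ in range(2)])
    def __call__(self, inputs, states):
        return self._cell(inputs, states)

prenet = lambda x: tf.layers.dense(x, 256, tf.nn.relu)     # toy 1-layer pre-net
frame_projection = lambda x: tf.layers.dense(x, num_mels)  # r = 1 for simplicity
stop_projection = lambda x: tf.layers.dense(x, 1, tf.nn.sigmoid)

attention = LocationSensitiveAttention(128, encoder_outputs)

decoder_cell = TacotronDecoderCell(prenet, attention, ToyDecoderRNN(),
                                   frame_projection, stop_projection)

state = decoder_cell.zero_state(batch_size, tf.float32)
go_frame = tf.zeros([batch_size, num_mels])                 # <GO> frame at t = 0
(frame, stop_token), state = decoder_cell(go_frame, state)  # one decoding step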
32 changes: 27 additions & 5 deletions tacotron/models/attention.py
@@ -4,7 +4,9 @@
from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import _BaseAttentionMechanism
from tensorflow.python.ops import nn_ops
from tensorflow.python.layers import core as layers_core
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import math_ops
from hparams import hparams


@@ -109,7 +111,7 @@ def __init__(self,
self._num_units = num_units
self._name = name

def __call__(self, query, state):
def get_alignments(self, query, previous_alignments):
"""Score the query based on the keys and values.
Args:
query: Tensor of dtype matching `self.values` and shape
@@ -122,14 +124,34 @@ def __call__(self, query, state):
`[batch_size, alignments_size]` (`alignments_size` is memory's
`max_time`).
"""
previous_alignments = state
with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]):
# processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim]
processed_query = self.query_layer(query) if self.query_layer else query
# energy shape [batch_size, max_time]
energy = _location_sensitive_score(processed_query, previous_alignments, self._keys)
# alignments shape = energy shape = [batch_size, max_time]
alignments = self._probability_fn(energy, previous_alignments)
#Seems pretty useless but tensorflow attention wrapper requires it to work properly
next_state = alignments
return alignments, next_state
return alignments
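
_location_sensitive_score above implements the hybrid attention of Chorowski et al., which extends additive (Bahdanau) attention with convolutional features computed from the previous alignments. A hedged sketch of what that scoring function computes (the real one is defined elsewhere in this file; the filter count and kernel size follow hparams.attention_filters and hparams.attention_kernel):

import tensorflow as tf

def _location_sensitive_score_sketch(processed_query, previous_alignments, keys):
    # processed_query: [batch_size, attention_dim] (query after the query layer)
    # previous_alignments: [batch_size, max_time]
    # keys: [batch_size, max_time, attention_dim] (processed memory)
    num_units = keys.shape[-1].value

    # Location features f = F * previous_alignments (a conv over the time axis)
    f = tf.layers.conv1d(tf.expand_dims(previous_alignments, axis=2),
                         filters=20, kernel_size=7, padding='same',
                         name='location_features')
    processed_location = tf.layers.dense(f, num_units, use_bias=False,
                                         name='location_layer')

    v_a = tf.get_variable('attention_variable', shape=[num_units],
                          dtype=processed_query.dtype)
    # Energy e = v_a^T tanh(W*query + U*f + keys), reduced over attention_dim
    return tf.reduce_sum(v_a * tf.tanh(tf.expand_dims(processed_query, 1)
                                       + processed_location + keys), axis=2)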


def __call__(self, query_vector, previous_alignments):
"""Computes the context vector and alignments.
"""
alignments = self.get_alignments(query_vector, previous_alignments)

# Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
expanded_alignments = array_ops.expand_dims(alignments, 1)

# Context is the inner product of alignments and values along the
# memory time dimension.
# alignments shape is
# [batch_size, 1, memory_time]
# attention_mechanism.values shape is
# [batch_size, memory_time, memory_size]
# the batched matmul is over memory_time, so the output shape is
# [batch_size, 1, memory_size].
# we then squeeze out the singleton dim.
context = math_ops.matmul(expanded_alignments, self.values)
context = array_ops.squeeze(context, [1])

return context, alignments
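
The batched matmul above is just an alignments-weighted average of the memory along its time axis; a quick self-contained numpy check of the shapes:

import numpy as np

batch_size, memory_time, memory_size = 2, 5, 3
alignments = np.full((batch_size, memory_time), 1.0 / memory_time)  # uniform weights
values = np.random.randn(batch_size, memory_time, memory_size)

expanded = alignments[:, None, :]                 # [batch_size, 1, memory_time]
context = np.matmul(expanded, values).squeeze(1)  # [batch_size, memory_size]

# With uniform alignments, the context is the mean over memory time.
assert np.allclose(context, values.mean(axis=1))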
