
Commit

attention + typos + voice test
Rayhane-mamah authored Mar 10, 2018
1 parent be09c5a commit 7393fd5
Showing 13 changed files with 194 additions and 113 deletions.
1 change: 0 additions & 1 deletion tacotron/datasets/feeder.py
@@ -100,7 +100,6 @@ def _prepare_batch(batch, outputs_per_step):
inputs = _prepare_inputs([x[0] for x in batch])
input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32)
mel_targets = _prepare_targets([x[1] for x in batch], outputs_per_step)
- #linear_targets = _prepare_targets([x[2] for x in batch], outputs_per_step)
return (inputs, input_lengths, mel_targets)

def _prepare_inputs(inputs):
5 changes: 1 addition & 4 deletions tacotron/datasets/preprocessor.py
@@ -54,12 +54,9 @@ def _process_utterance(out_dir, index, wav_path, text):
# Load the audio as numpy array
wav = audio.load_wav(wav_path)

- # Compute the linear-scale spectrogram from the wav to calculate n_frames
- spectrogram = audio.spectrogram(wav).astype(np.float32)
- n_frames = spectrogram.shape[1]

# Compute the mel scale spectrogram from the wav
mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
+ n_frames = mel_spectrogram.shape[1]

# Write the spectrogram to disk
mel_filename = 'ljspeech-mel-{:05d}.npy'.format(index)
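Note: dropping the linear spectrogram does not change n_frames, since both spectrograms share the same STFT hop: for a centered STFT, n_frames ≈ 1 + floor(len(wav) / hop_size) regardless of how many frequency bins are kept, so mel_spectrogram.shape[1] gives the same count while skipping an otherwise unused computation.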
68 changes: 68 additions & 0 deletions tacotron/griffin_lim_synthesis_example.ipynb
@@ -0,0 +1,68 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"(660, 80)"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"from utils.audio import *\n",
"import os\n",
"\n",
"mel_folder = 'logs-Tacotron'\n",
"mel_file = 'ljspeech-mel-prediction-step-1400.npy'\n",
"\n",
"mel_file = os.path.join(mel_folder, mel_file) \n",
"mel_spectro = np.load(mel_file)\n",
"mel_spectro.shape"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"wav = inv_mel_spectrogram(mel_spectro.T)\n",
"save_wav(wav, 'wav_out/test.wav')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
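The notebook above is the commit's "voice test": it loads a mel-spectrogram prediction saved during training and inverts it to a waveform. The actual inversion lives in utils/audio.py, which is not part of this diff; as a hedged sketch of what inv_mel_spectrogram typically involves (function names and STFT parameters here are illustrative, not the repository's API), Griffin-Lim phase reconstruction with the griffin_lim_iters=60 setting from hparams.py looks roughly like this:

import numpy as np
import librosa

def griffin_lim(magnitude, n_iters=60, n_fft=2048, hop_length=None):
    # magnitude: linear-frequency magnitude spectrogram, shape [1 + n_fft//2, frames].
    # Start from a random phase and alternate STFT/iSTFT, keeping the target
    # magnitude while refining only the phase estimate.
    angles = np.exp(2j * np.pi * np.random.rand(*magnitude.shape))
    signal = librosa.istft(magnitude * angles, hop_length=hop_length)
    for _ in range(n_iters):
        rebuilt = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
        angles = np.exp(1j * np.angle(rebuilt))
        signal = librosa.istft(magnitude * angles, hop_length=hop_length)
    return signal

Since the notebook starts from an 80-band mel spectrogram (shape (660, 80) above), inv_mel_spectrogram presumably also undoes the dB normalization and maps mel bands back to linear frequency bins (for example via a pseudo-inverse of the mel filterbank) before running Griffin-Lim; the power=1.3 value added to hparams.py below is commonly the exponent applied to magnitudes beforehand to reduce artifacts.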
16 changes: 9 additions & 7 deletions tacotron/hparams.py
@@ -20,9 +20,11 @@
ref_level_db=20,
fmin=125,
fmax=7600,
+ power=1.3,
+ griffin_lim_iters=60,

#Model
- outputs_per_step = 1, #number of frames to generate at each decoding step
+ outputs_per_step = 5, #number of frames to generate at each decoding step (speeds up computation and allows for higher batch size)
embedding_dim = 512, #dimension of embedding space
enc_conv_num_layers=3, #number of encoder convolutional layers
enc_conv_kernel_size=(5, ), #size of encoder convolution filters for each layer
@@ -35,19 +37,19 @@
postnet_num_layers=5, #number of postnet convolutional layers
postnet_kernel_size=(5, ), #size of postnet convolution filters for each layer
postnet_channels=512, #number of postnet convolution filters for each layer
- max_iters=810, #Max decoder steps during inference (feel free to change it)
+ max_iters=175, #Max decoder steps during inference (feel free to change it)

#Training
batch_size = 32, #number of training samples on each training steps
- reg_weight = 10e-6, #regularization weight (for l2 regularization)
+ reg_weight = 10**(-6), #regularization weight (for l2 regularization)
decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay
decay_steps = 50000, #starting point for learning rate decay (and determines the decay slope)
- decay_rate = 0.97, #learning rate decay rate
- initial_learning_rate = 10e-3, #starting learning rate
- final_learning_rate = 10e-5, #minimal learning rate
+ decay_rate = 0.4, #learning rate decay rate
+ initial_learning_rate = 10**(-3), #starting learning rate
+ final_learning_rate = 10**(-5), #minimal learning rate
adam_beta1 = 0.9, #AdamOptimizer beta1 parameter
adam_beta2 = 0.999, #AdamOptimizer beta2 parameter
- adam_epsilon = 10e-6, #AdamOptimizer beta3 parameter
+ adam_epsilon = 10**(-6), #AdamOptimizer epsilon parameter
zoneout_rate=0.1, #zoneout rate for all LSTM cells in the network
dropout_rate=0.5, #dropout rate for all convolutional layers + prenet

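Several of the value changes above are really a Python numeric-literal fix rather than retuning: 10e-6 is scientific notation for 10 × 10^-6, so the old learning rates, regularization weight and Adam epsilon were ten times larger than the 10^-3 / 10^-5 / 10^-6 values the new literals spell out. A quick interpreter check:

>>> 10e-3, 10 ** (-3)   # old vs. new initial_learning_rate
(0.01, 0.001)
>>> 10e-6, 10 ** (-6)   # old vs. new reg_weight / adam_epsilon
(1e-05, 1e-06)

The outputs_per_step = 5 change means the decoder now emits five mel frames per step, which is why max_iters can drop from 810 to 175 (175 × 5 = 875 frames still covers slightly more audio than 810 single-frame steps) and why tacotron.py below gains a reshape from [batch, steps, num_mels × 5] back to [batch, steps × 5, num_mels].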
8 changes: 5 additions & 3 deletions tacotron/models/attention.py
@@ -1,7 +1,7 @@
"""Attention file for location based attention (compatible with tensorflow attention wrapper)"""

import tensorflow as tf
- from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import _BaseMonotonicAttentionMechanism
+ from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import _BaseAttentionMechanism
from tensorflow.python.ops import nn_ops
from tensorflow.python.layers import core as layers_core
from tensorflow.python.ops import variable_scope
@@ -37,6 +37,7 @@ def _location_sensitive_score(W_query, attention_weights, W_keys):
# location features [batch_size, max_time, filters]
f = tf.layers.conv1d(attention_weights, filters=32,
kernel_size=(31, ), padding='same',
+ kernel_initializer=tf.contrib.layers.xavier_initializer(),
name='location_features')

# Projected location features [batch_size, max_time, attention_dim]
@@ -49,12 +50,13 @@ def _location_sensitive_score(W_query, attention_weights, W_keys):
scope='W_filter')

v_a = tf.get_variable(
- 'v_a', shape=[num_units], dtype=tf.float32)
+ 'v_a', shape=[num_units], dtype=tf.float32,
+ initializer=tf.contrib.layers.xavier_initializer())

return tf.reduce_sum(v_a * tf.tanh(W_keys + tf.expand_dims(W_query, axis=1) + W_fil), axis=2)


- class LocationSensitiveAttention(_BaseMonotonicAttentionMechanism):
+ class LocationSensitiveAttention(_BaseAttentionMechanism):
"""Impelements Bahdanau-style (cumulative) scoring function.
Usually referred to as "hybrid" attention (content-based + location-based)
This attention is described in:
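For reference, the energy computed by _location_sensitive_score above is the hybrid (content-based + location-based) attention of Chorowski et al. (2015):

    e_{i,j} = v_a^T tanh(W s_i + V h_j + U f_{i,j})

where s_i is the decoder query state, h_j are the encoder outputs, and f_i = F * alpha_{i-1} are the location features produced by convolving the previous (here cumulative) alignments with the 32-filter, width-31 convolution shown above. In the code, W_query, W_keys and W_fil are the already-projected query, key and location-feature terms, so the returned value is reduce_sum(v_a * tanh(W_keys + W_query + W_fil)) over the attention dimension. The base-class switch from _BaseMonotonicAttentionMechanism to _BaseAttentionMechanism presumably reflects that these energies are normalized with an ordinary softmax rather than the monotonic-attention normalization.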
4 changes: 2 additions & 2 deletions tacotron/models/dynamic_decoder.py
@@ -224,12 +224,12 @@ def _maybe_copy_state(new, cur):
stop_token_loss = res[5]

#Average <stop_token> error over decoding steps
- avg_stop_loss = stop_token_loss / steps
+ #avg_stop_loss = stop_token_loss / steps

final_outputs = nest.map_structure(
lambda ta: ta.stack(), final_outputs_ta)
if not output_time_major:
final_outputs = nest.map_structure(
_transpose_batch_time, final_outputs)

- return final_outputs, final_state, avg_stop_loss
+ return final_outputs, final_state, stop_token_loss
9 changes: 6 additions & 3 deletions tacotron/models/modules.py
@@ -13,7 +13,8 @@ def conv1d(inputs, kernel_size, channels, activation, is_training, scope):
filters=channels,
kernel_size=kernel_size,
activation=activation,
- padding='same')
+ padding='same',
+ kernel_initializer=tf.contrib.layers.xavier_initializer())
batched = tf.layers.batch_normalization(conv1d_output, training=is_training)
return tf.layers.dropout(batched, rate=drop_rate, training=is_training,
name='dropout_{}'.format(scope))
@@ -91,7 +92,9 @@ def prenet(inputs, is_training, layer_sizes=[128, 128], scope=None):

with tf.variable_scope(scope):
for i, size in enumerate(layer_sizes):
- dense = tf.layers.dense(x, units=size, activation=tf.nn.relu, name='dense_{}'.format(i + 1))
+ dense = tf.layers.dense(x, units=size, activation=tf.nn.relu,
+ kernel_initializer=tf.contrib.layers.xavier_initializer(),
+ name='dense_{}'.format(i + 1))
#The paper discussed introducing diversity in generation at inference time
#by using a dropout of 0.5 only in prenet layers.
x = tf.layers.dropout(dense, rate=drop_rate, training=is_training,
@@ -115,7 +118,7 @@ def stop_token_projection(x, shape=1, activation=lambda _: _, weights_name='stop
inference time for stop token prediction
"""

- st_W = tf.get_variable(weights_name, shape=[x.shape[-1], 1], dtype=tf.float32, initializer=tf.truncated_normal_initializer())
+ st_W = tf.get_variable(weights_name, shape=[x.shape[-1], 1], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
st_b = tf.get_variable(bias_name, shape=[1], dtype=tf.float32, initializer=tf.zeros_initializer())

output = activation(tf.add(tf.matmul(x, st_W), st_b))
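The recurring edit in this commit (here, in attention.py above and in zoneout_LSTM.py below) swaps default and truncated-normal initializers for Xavier/Glorot initialization. As a reference sketch only (not the TensorFlow implementation), the uniform Glorot variant that tf.contrib.layers.xavier_initializer() uses by default draws weights like this:

import numpy as np

def xavier_uniform(fan_in, fan_out):
    # Glorot & Bengio (2010): pick the range so that activation and gradient
    # variances stay roughly constant from layer to layer.
    limit = np.sqrt(6.0 / (fan_in + fan_out))
    return np.random.uniform(-limit, limit, size=(fan_in, fan_out))

# e.g. one prenet dense layer (input size illustrative):
W = xavier_uniform(256, 128)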
32 changes: 0 additions & 32 deletions tacotron/models/rnn_wrappers.py
@@ -97,35 +97,3 @@ def call(self, inputs, state):

def zero_state(self, batch_size, dtype):
return self._cell.zero_state(batch_size, dtype)


# class LinearProjectionWrapper(RNNCell):
# """Operator adding an output projection to the given cell.
# This wrapper will perform a linear transformation with specified activation function.(Default to None)
# """
# def __init__(self, cell, projection_dim, activation=None):
# super(LinearProjectionWrapper, self).__init__()
# self._cell = cell
# self._projection_dim = projection_dim
# self._activation = activation

# @property
# def state_size(self):
# return self._cell.state_size

# @property
# def output_size(self):
# return self._projection_dim

# def zero_state(self, batch_size, dtype):
# with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
# return self._cell.zero_state(batch_size, dtype)

# def call(self, inputs, state):
# """Run the cell and output projection on inputs, starting from state."""
# output, res_state = self._cell(inputs, state)
# projected = projection(output, self._projection_dim)
# if self._activation:
# projected = self._activation(projected)

# return projected, res_state
42 changes: 24 additions & 18 deletions tacotron/models/tacotron.py
@@ -4,7 +4,7 @@
from .helpers import TacoTrainingHelper, TacoTestHelper
from .modules import *
from models.zoneout_LSTM import ZoneoutLSTMCell
- from tensorflow.contrib.seq2seq import AttentionWrapper
+ from tensorflow.contrib.seq2seq import AttentionWrapper, LuongAttention
from .rnn_wrappers import *
from tensorflow.contrib.rnn import MultiRNNCell, OutputProjectionWrapper
from .attention import LocationSensitiveAttention
@@ -53,9 +53,9 @@ def initialize(self, inputs, input_lengths, mel_targets=None, gta=False):

#Attention
attention_cell = AttentionWrapper(
- DecoderPrenetWrapper(ZoneoutLSTMCell(hp.attention_dim, is_training,
- zoneout_factor_cell=hp.zoneout_rate,
- zoneout_factor_output=hp.zoneout_rate), is_training),
+ DecoderPrenetWrapper(ZoneoutLSTMCell(hp.attention_dim, is_training, #Separate LSTM for attention mechanism
+ zoneout_factor_cell=hp.zoneout_rate, #based on original tacotron architecture
+ zoneout_factor_output=hp.zoneout_rate), is_training),
LocationSensitiveAttention(hp.attention_dim, encoder_outputs),
alignment_history=True,
output_attention=False,
@@ -72,7 +72,7 @@ def initialize(self, inputs, input_lengths, mel_targets=None, gta=False):
#Concat LSTM output with context vector
concat_decoder_cell = ConcatLSTMOutputAndAttentionWrapper(decoder_cell)

- #Projection to mel-spectrogram dimension (linear transformation)
+ #Projection to mel-spectrogram dimension (times number of outputs per step) (linear transformation)
output_cell = OutputProjectionWrapper(concat_decoder_cell, hp.num_mels * hp.outputs_per_step)

#Define the helper for our decoder
@@ -81,7 +81,7 @@ def initialize(self, inputs, input_lengths, mel_targets=None, gta=False):
else:
self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

#We"ll only limit decoder time steps during inference (consult hparams.py to modify the value)
#We'll only limit decoder time steps during inference (consult hparams.py to modify the value)
max_iterations = None if is_training else hp.max_iters

#initial decoder state
@@ -90,15 +90,19 @@ def initialize(self, inputs, input_lengths, mel_targets=None, gta=False):
#Decode
(decoder_output, _), final_decoder_state, self.stop_token_loss = dynamic_decode(
CustomDecoder(output_cell, self.helper, decoder_init_state),
- impute_finished=True, #Cut out padded parts
+ impute_finished=True, #Cut out padded parts (enabled)
maximum_iterations=max_iterations)

+ # Reshape outputs to be one output per entry
+ decoder_output = tf.reshape(decoder_output, [batch_size, -1, hp.num_mels])

#Compute residual using post-net
residual = postnet(decoder_output, is_training,
kernel_size=hp.postnet_kernel_size, channels=hp.postnet_channels)

#Project residual to same dimension as mel spectrogram
- projected_residual = projection(residual, shape=hp.num_mels,
+ projected_residual = projection(residual,
+ shape=hp.num_mels,
scope='residual_projection')

#Compute the mel spectrogram
@@ -136,7 +140,8 @@ def add_loss(self):
# Get all trainable variables
all_vars = tf.trainable_variables()
# Compute the regularization term
- regularization = tf.add_n([tf.nn.l2_loss(v) for v in all_vars]) * hp.reg_weight
+ regularization = tf.add_n([tf.nn.l2_loss(v) for v in all_vars
+ if not('bias' in v.name or 'Bias' in v.name)]) * hp.reg_weight

# Compute final loss term
self.before_loss = before
@@ -173,15 +178,16 @@ def add_optimizer(self, global_step):
global_step=global_step)

def _learning_rate_decay(self, init_lr, global_step):
- # Exponential decay starting after 50,000 iterations
+ # Exponential decay starting after 50,000 iterations (ignored for now)
# We won't drop learning rate below 10e-5
hp = self._hparams
step = tf.cast(global_step + 1, dtype=tf.float32)
- if tf.greater(step, self.decay_steps) == True:
- lr = tf.train.exponential_decay(init_lr,
- global_step - decay_steps + 1,
- self.decay_steps,
- self.decay_rate,
- name='exponential_decay')
- return max(hp.final_learning_rate, lr)
- return init_lr
+ #Testing decaying rate since beginning (as the model seems to train faster than expected)
+ #if tf.greater(step, self.decay_steps) == True:
+ lr = tf.train.exponential_decay(init_lr,
+ global_step - self.decay_steps + 1,
+ self.decay_steps,
+ self.decay_rate,
+ name='exponential_decay')
+ return tf.maximum(hp.final_learning_rate, lr)
+ #return init_lr
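Two things are worth noting about the removed code path: the check "if tf.greater(step, self.decay_steps) == True" compares a graph-mode tensor to the Python constant True, which is never equal, so the decay branch never ran (and it also referenced an undefined decay_steps instead of self.decay_steps); the function therefore always returned init_lr. The rewritten version applies exponential decay from the first step and clips it with tf.maximum at final_learning_rate. A plain-Python sketch of the resulting schedule with the hparams values above (initial 1e-3, floor 1e-5, decay_steps 50000, decay_rate 0.4):

def learning_rate(step, init_lr=1e-3, final_lr=1e-5, decay_steps=50000, decay_rate=0.4):
    # Mirrors tf.train.exponential_decay(init_lr, step - decay_steps + 1,
    # decay_steps, decay_rate) followed by tf.maximum(final_lr, ...).
    decayed = init_lr * decay_rate ** ((step - decay_steps + 1) / decay_steps)
    return max(final_lr, decayed)

# learning_rate(50000)  -> ~1.0e-3
# learning_rate(150000) -> ~1.6e-4
# learning_rate(300000) -> ~1.0e-5 (clipped to the floor shortly after)

Because the offset (step - decay_steps + 1) is negative early on, the schedule actually starts slightly above init_lr (about 2.5e-3 at step 0) and passes through 1e-3 around step 50000.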
3 changes: 2 additions & 1 deletion tacotron/models/zoneout_LSTM.py
@@ -22,7 +22,8 @@ class ZoneoutLSTMCell(RNNCell):

def __init__(self, num_units, is_training, input_size=None,
use_peepholes=False, cell_clip=None,
- initializer=orthogonal_initializer(),
+ #initializer=orthogonal_initializer(),
+ initializer=tf.contrib.layers.xavier_initializer(),
num_proj=None, proj_clip=None, ext_proj=None,
forget_bias=1.0,
state_is_tuple=True,
