Merge pull request #11 from Flux9665/multi_lingual_multi_speaker
fix language ID not being used properly
Flux9665 committed Mar 19, 2022
2 parents 97c9006 + 944cede commit 81075a6
Showing 27 changed files with 1,260 additions and 1,944 deletions.
45 changes: 36 additions & 9 deletions InferenceInterfaces/InferenceFastSpeech2.py
@@ -3,6 +3,7 @@

import librosa.display as lbd
import matplotlib.pyplot as plt
import noisereduce
import sounddevice
import soundfile
import torch
@@ -16,45 +17,65 @@

class InferenceFastSpeech2(torch.nn.Module):

def __init__(self, device="cpu", model_name="Meta", language="en"):
def __init__(self, device="cpu", model_name="Meta", language="en", noise_reduce=False):
super().__init__()
self.device = device
self.text2phone = ArticulatoryCombinedTextFrontend(language=language, add_silence_to_end=True)
checkpoint = torch.load(os.path.join("Models", f"FastSpeech2_{model_name}", "best.pt"), map_location='cpu')
self.use_lang_id = True
try:
self.phone2mel = FastSpeech2(weights=checkpoint["model"]).to(torch.device(device)) # multi speaker multi language
except RuntimeError:
try:
self.use_lang_id = False
self.phone2mel = FastSpeech2(weights=checkpoint["model"], lang_embs=None).to(torch.device(device)) # multi speaker single language
except RuntimeError:
self.phone2mel = FastSpeech2(weights=checkpoint["model"], lang_embs=None, utt_embed_dim=None).to(torch.device(device)) # single speaker
self.mel2wav = HiFiGANGenerator(path_to_weights=os.path.join("Models", "HiFiGAN_combined", "best.pt")).to(torch.device(device))
self.default_utterance_embedding = checkpoint["default_emb"].to(self.device)
self.phone2mel.eval()
self.mel2wav.eval()
self.lang_id = get_language_id(language)
if self.use_lang_id:
self.lang_id = get_language_id(language)
else:
self.lang_id = None
self.to(torch.device(device))
self.noise_reduce = noise_reduce
if self.noise_reduce:
self.prototypical_noise = None
self.update_noise_profile()

def set_utterance_embedding(self, path_to_reference_audio):
wave, sr = soundfile.read(path_to_reference_audio)
self.default_utterance_embedding = ProsodicConditionExtractor(sr=sr).extract_condition_from_reference_wave(wave).to(self.device)
if self.noise_reduce:
self.update_noise_profile()

def update_noise_profile(self):
self.noise_reduce = False
self.prototypical_noise = self("~." * 100, input_is_phones=True).cpu().numpy()
self.noise_reduce = True

def set_language(self, lang_id):
"""
The id parameter actually refers to the shorthand. This has become ambiguous with the introduction of the actual language IDs
"""
self.text2phone = ArticulatoryCombinedTextFrontend(language=lang_id, add_silence_to_end=True)
self.lang_id = get_language_id(lang_id).to(self.device)
if self.use_lang_id:
self.lang_id = get_language_id(lang_id).to(self.device)
else:
self.lang_id = None

def forward(self, text, view=False, durations=None, pitch=None, energy=None):
def forward(self, text, view=False, durations=None, pitch=None, energy=None, input_is_phones=False):
with torch.inference_mode():
phones = self.text2phone.string_to_tensor(text).to(torch.device(self.device))
phones = self.text2phone.string_to_tensor(text, input_phonemes=input_is_phones).to(torch.device(self.device))
mel, durations, pitch, energy = self.phone2mel(phones,
return_duration_pitch_energy=True,
utterance_embedding=self.default_utterance_embedding,
durations=durations,
pitch=pitch,
energy=energy)
energy=energy,
lang_id=self.lang_id)
mel = mel.transpose(0, 1)
wave = self.mel2wav(mel)
if view:
@@ -78,13 +99,19 @@ def forward(self, text, view=False, durations=None, pitch=None, energy=None):
ax[0].set_title(text)
plt.subplots_adjust(left=0.05, bottom=0.1, right=0.95, top=.9, wspace=0.0, hspace=0.0)
plt.show()
if self.noise_reduce:
wave = torch.tensor(noisereduce.reduce_noise(y=wave.cpu().numpy(), y_noise=self.prototypical_noise, sr=48000, stationary=True), device=self.device)
return wave

def read_to_file(self, text_list, file_location, silent=False, dur_list=None, pitch_list=None, energy_list=None):
"""
:param silent: Whether to be verbose about the process
:param text_list: A list of strings to be read
:param file_location: The path and name of the file it should be saved to
Args:
silent: Whether to be verbose about the process
text_list: A list of strings to be read
file_location: The path and name of the file it should be saved to
energy_list: list of energy tensors to be used for the texts
pitch_list: list of pitch tensors to be used for the texts
dur_list: list of duration tensors to be used for the texts
"""
if not dur_list:
dur_list = []
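As a quick check of the new behaviour, a minimal usage sketch of the updated inference interface (not part of this commit; the output filename is an assumption, and a trained checkpoint is expected under Models/FastSpeech2_Meta):

import soundfile
from InferenceInterfaces.InferenceFastSpeech2 import InferenceFastSpeech2

tts = InferenceFastSpeech2(device="cpu", model_name="Meta", language="en", noise_reduce=True)
tts.set_language("de")       # switches the phonemizer and, if the checkpoint is multilingual, the language embedding
wave = tts("Hallo Welt!")    # forward() now passes self.lang_id on to the acoustic model
soundfile.write("hallo_welt.wav", wave.cpu().numpy(), 48000)  # 48 kHz, matching the rate used in the noise-reduction call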
6 changes: 3 additions & 3 deletions Preprocessing/ArticulatoryCombinedTextFrontend.py
@@ -288,9 +288,7 @@ def english_text_expansion(text):


def get_language_id(language):
if language == "en":
return torch.LongTensor([0])
elif language == "de":
if language == "de":
return torch.LongTensor([1])
elif language == "el":
return torch.LongTensor([2])
@@ -312,6 +310,8 @@ def get_language_id(language):
return torch.LongTensor([10])
elif language == "it":
return torch.LongTensor([11])
elif language == "en":
return torch.LongTensor([12])


if __name__ == '__main__':
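The shorthand-to-embedding-index mapping stays an if/elif chain; purely as an illustration of the mapping after this change, a lookup-table sketch (an excerpt, not the repository's code; the full chain covers more languages than shown here):

import torch

LANG_TO_ID = {"de": 1, "el": 2, "it": 11, "en": 12}  # excerpt of the table implied by the chain above

def get_language_id(language):
    if language in LANG_TO_ID:
        return torch.LongTensor([LANG_TO_ID[language]])
    return None  # unknown shorthand falls through, as in the elif chain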
14 changes: 11 additions & 3 deletions Preprocessing/AudioPreprocessor.py
@@ -11,7 +11,7 @@

class AudioPreprocessor:

def __init__(self, input_sr, output_sr=None, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False, device="cpu"):
def __init__(self, input_sr, output_sr=None, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False, device="cpu", fmax_for_spec=8000):
"""
The parameters are by default set up to do well
on a 16kHz signal. A different sampling rate may
@@ -28,6 +28,7 @@ def __init__(self, input_sr, output_sr=None, melspec_buckets=80, hop_length=256,
self.mel_buckets = melspec_buckets
self.meter = pyln.Meter(input_sr)
self.final_sr = input_sr
self.fmax_for_spec = fmax_for_spec
if cut_silence:
torch.hub._validate_not_a_forked_repo = lambda a, b, c: True # torch 1.9 has a bug in the hub loading, this is a workaround
# careful: assumes 16kHz or 8kHz audio
@@ -58,7 +59,12 @@ def cut_silence_from_audio(self, audio):
"""
with torch.inference_mode():
speech_timestamps = self.get_speech_timestamps(audio, self.silero_model, sampling_rate=self.final_sr)
return audio[speech_timestamps[0]['start']:speech_timestamps[-1]['end']]
try:
result = audio[speech_timestamps[0]['start']:speech_timestamps[-1]['end']]
return result
except IndexError:
print("Audio might be too short to cut silences from front and back.")
return audio

def to_mono(self, x):
"""
@@ -82,7 +88,7 @@ def normalize_loudness(self, audio):
peak_normed = numpy.divide(loud_normed, peak)
return peak_normed

def logmelfilterbank(self, audio, sampling_rate, fmin=40, fmax=8000, eps=1e-10):
def logmelfilterbank(self, audio, sampling_rate, fmin=40, fmax=None, eps=1e-10):
"""
Compute log-Mel filterbank
@@ -91,6 +97,8 @@ def logmelfilterbank(self, audio, sampling_rate, fmin=40, fmax=8000, eps=1e-10):
compatibility, this is kept for now. If there is ever a reason to completely re-train
all models, this would be a good opportunity to make the switch.
"""
if fmax is None:
fmax = self.fmax_for_spec
if isinstance(audio, torch.Tensor):
audio = audio.numpy()
# get amplitude spectrogram
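For context, a small sketch of how the new fmax_for_spec argument flows into logmelfilterbank (values are illustrative only):

import numpy
from Preprocessing.AudioPreprocessor import AudioPreprocessor

audio = numpy.random.randn(48000).astype("float32")          # one second of dummy audio at 48 kHz
ap = AudioPreprocessor(input_sr=48000, fmax_for_spec=16000)   # default stays at 8000 Hz if not overridden
mel = ap.logmelfilterbank(audio, sampling_rate=48000)         # fmax=None, so it falls back to self.fmax_for_spec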
@@ -8,7 +8,6 @@
from speechbrain.pretrained import EncoderClassifier
from torch.multiprocessing import Manager
from torch.multiprocessing import Process
from torch.multiprocessing import set_start_method
from torch.utils.data import Dataset
from tqdm import tqdm

@@ -28,17 +27,12 @@ def __init__(self,
cut_silences=True,
rebuild_cache=False,
verbose=False,
device="cpu"):
device="cpu",
phone_input=False):
os.makedirs(cache_dir, exist_ok=True)
if not os.path.exists(os.path.join(cache_dir, "aligner_train_cache.pt")) or rebuild_cache:
if (device == "cuda" or device == torch.device("cuda")) and cut_silences:
try:
set_start_method('spawn') # in order to be able to make use of cuda in multiprocessing
except RuntimeError:
pass
elif cut_silences:
torch.set_num_threads(1)
if cut_silences:
torch.set_num_threads(1)
torch.hub.load(repo_or_dir='snakers4/silero-vad',
model='silero_vad',
force_reload=False,
@@ -68,7 +62,8 @@ def __init__(self,
max_len_in_seconds,
cut_silences,
verbose,
device),
"cpu",
phone_input),
daemon=True))
process_list[-1].start()
for process in process_list:
@@ -140,7 +135,8 @@ def cache_builder_process(self,
max_len,
cut_silences,
verbose,
device):
device,
phone_input):
process_internal_dataset_chunk = list()
tf = ArticulatoryCombinedTextFrontend(language=lang, use_word_boundaries=False)
_, sr = sf.read(path_list[0])
@@ -171,9 +167,9 @@ def cache_builder_process(self,
# raw audio preprocessing is done
transcript = self.path_to_transcript_dict[path]
try:
cached_text = tf.string_to_tensor(transcript, handle_missing=False).squeeze(0).cpu().numpy()
cached_text = tf.string_to_tensor(transcript, handle_missing=False, input_phonemes=phone_input).squeeze(0).cpu().numpy()
except KeyError:
tf.string_to_tensor(transcript, handle_missing=True).squeeze(0).cpu().numpy()
tf.string_to_tensor(transcript, handle_missing=True, input_phonemes=phone_input).squeeze(0).cpu().numpy()
continue # we skip sentences with unknown symbols
try:
if len(cached_text[0]) != 66:
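A brief sketch of what the new phone_input flag changes downstream (the phoneme string below is made up for illustration):

from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend

tf = ArticulatoryCombinedTextFrontend(language="en", use_word_boundaries=False)
# with phone_input=True the cached transcripts are treated as phoneme strings,
# so string_to_tensor skips grapheme-to-phoneme conversion
features = tf.string_to_tensor("hɛloʊ wɜɹld", handle_missing=False, input_phonemes=True)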
@@ -1,3 +1,5 @@
import random

import librosa.display as lbd
import matplotlib.pyplot as plt
import torch
@@ -48,13 +50,7 @@ def train_loop(net,
train_iters.append(iter(train_loaders[-1]))
default_embeddings = {"en": None, "de": None, "el": None, "es": None, "fi": None, "ru": None, "hu": None, "nl": None, "fr": None}
for index, lang in enumerate(["en", "de", "el", "es", "fi", "ru", "hu", "nl", "fr"]):
default_embedding = None
for datapoint in datasets[index]:
if default_embedding is None:
default_embedding = datapoint[7].squeeze()
else:
default_embedding = default_embedding + datapoint[7].squeeze()
default_embeddings[lang] = (default_embedding / len(datasets[index])).to(device)
default_embeddings[lang] = datasets[index][0][7].squeeze().to(device)
optimizer = torch.optim.RAdam(net.parameters(), lr=lr, eps=1.0e-06, weight_decay=0.0)
grad_scaler = GradScaler()
scheduler = WarmupScheduler(optimizer, warmup_steps=warmup_steps)
@@ -84,7 +80,7 @@ def train_loop(net,
# =============================
for step in tqdm(range(step_counter, steps)):
batches = []
for index in range(len(datasets)):
for index in random.sample(list(range(len(datasets))), len(datasets)):
# we get one batch for each task (i.e. language in this case)
try:
batch = next(train_iters[index])
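A self-contained sketch of the randomized per-step task order introduced above (the lists are stand-ins for the per-language datasets and iterators used in train_loop):

import random

datasets = [["en batch"], ["de batch"], ["el batch"]]   # stand-ins for the language-specific datasets
train_iters = [iter(d) for d in datasets]

batches = []
for index in random.sample(range(len(datasets)), len(datasets)):
    # one batch per language, but the visiting order is reshuffled every step
    batches.append(next(train_iters[index]))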
73 changes: 73 additions & 0 deletions TrainingInterfaces/TrainingPipelines/FastSpeech2_English.py
@@ -0,0 +1,73 @@
import random

import torch
from torch.utils.data import ConcatDataset

from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2
from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop
from Utility.corpus_preparation import prepare_fastspeech_corpus
from Utility.path_to_transcript_dicts import *


def run(gpu_id, resume_checkpoint, finetune, model_dir, resume):
if gpu_id == "cpu":
os.environ["CUDA_VISIBLE_DEVICES"] = ""
device = torch.device("cpu")

else:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
device = torch.device("cuda")

torch.manual_seed(131714)
random.seed(131714)
torch.random.manual_seed(131714)

print("Preparing")

if model_dir is not None:
save_dir = model_dir
else:
save_dir = os.path.join("Models", "FastSpeech2_English")
os.makedirs(save_dir, exist_ok=True)

datasets = list()
datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_nancy(),
corpus_dir=os.path.join("Corpora", "Nancy"),
lang="en"))

datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_ljspeech(),
corpus_dir=os.path.join("Corpora", "LJSpeech"),
lang="en"))

datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_all_clean(),
corpus_dir=os.path.join("Corpora", "libri_all_clean"),
lang="en"))

datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_vctk(),
corpus_dir=os.path.join("Corpora", "vctk"),
lang="en"))

datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_nvidia_hifitts(),
corpus_dir=os.path.join("Corpora", "hifi"),
lang="en"))

train_set = ConcatDataset(datasets)

model = FastSpeech2(lang_embs=100)
# because we want to finetune it, we treat it as multilingual, even though we are only interested in German here

print("Training model")
train_loop(net=model,
train_dataset=train_set,
device=device,
save_directory=save_dir,
steps=500000,
batch_size=10,
lang="en",
lr=0.001,
epochs_per_save=1,
warmup_steps=4000,
path_to_checkpoint="Models/FastSpeech2_Meta/best.pt",
fine_tune=True,
resume=resume)
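A minimal sketch of invoking the new English pipeline directly (argument values are assumptions; in practice run() is presumably dispatched by the repository's training launcher rather than called by hand):

from TrainingInterfaces.TrainingPipelines.FastSpeech2_English import run

run(gpu_id="0", resume_checkpoint=None, finetune=True, model_dir=None, resume=False)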