From 6865a9d7f6a57e878e6dab5a98b1df4ae9cb5427 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Mon, 28 Feb 2022 18:53:30 +0100 Subject: [PATCH 01/33] re-generate some experiment audio --- .../EvaluationScripts/SpeakerVisualization.py | 51 ++++++++++++------- generate_audio_for_experiments.py | 29 +++++++++++ 2 files changed, 63 insertions(+), 17 deletions(-) create mode 100644 generate_audio_for_experiments.py diff --git a/Utility/EvaluationScripts/SpeakerVisualization.py b/Utility/EvaluationScripts/SpeakerVisualization.py index 90918bb0..cc533277 100644 --- a/Utility/EvaluationScripts/SpeakerVisualization.py +++ b/Utility/EvaluationScripts/SpeakerVisualization.py @@ -2,7 +2,6 @@ import numpy import soundfile as sf from matplotlib import pyplot as plt -from matplotlib.markers import MarkerStyle matplotlib.use("tkAgg") from sklearn.manifold import TSNE @@ -20,12 +19,12 @@ def __init__(self, sr=48000, device="cpu"): Args: sr: The sampling rate of the audios you want to visualize. """ - self.tsne = TSNE(verbose=1, learning_rate=4, perplexity=30, n_iter=200000, n_iter_without_progress=8000, init='pca', n_jobs=-1) + self.tsne = TSNE(n_jobs=-1) self.pca = PCA(n_components=2) self.pros_cond_ext = ProsodicConditionExtractor(sr=sr, device=device) self.sr = sr - def visualize_speaker_embeddings(self, label_to_filepaths, title_of_plot, save_file_path=None, include_pca=True): + def visualize_speaker_embeddings(self, label_to_filepaths, title_of_plot, save_file_path=None, include_pca=True, legend=True): label_list = list() embedding_list = list() for label in tqdm(label_to_filepaths): @@ -47,30 +46,48 @@ def visualize_speaker_embeddings(self, label_to_filepaths, title_of_plot, save_f self._plot_embeddings(projected_data=dimensionality_reduced_embeddings_tsne, labels=label_list, title=title_of_plot + " t-SNE" if include_pca else title_of_plot, - save_file_path=save_file_path) + save_file_path=save_file_path, + legend=legend) if include_pca: dimensionality_reduced_embeddings_pca = self.pca.fit_transform(embeddings_as_array) self._plot_embeddings(projected_data=dimensionality_reduced_embeddings_pca, labels=label_list, title=title_of_plot + " PCA", - save_file_path=save_file_path) + save_file_path=save_file_path, + legend=legend) - def _plot_embeddings(self, projected_data, labels, title, save_file_path): + def _plot_embeddings(self, projected_data, labels, title, save_file_path, legend): label_to_color = dict() for index, label in enumerate(list(set(labels))): label_to_color[label] = (1 / len(labels)) * index - plt.clf() - plt.scatter(x=[x[0] for x in projected_data], - y=[x[1] for x in projected_data], - marker=MarkerStyle(marker="."), - c=[label_to_color[label] for label in labels], - cmap='gist_rainbow', - alpha=0.6) - plt.tight_layout() - plt.axis('off') - plt.subplots_adjust(top=0.9, bottom=0.0, right=1.0, left=0.0) - plt.title(title) + labels_to_points_x = dict() + labels_to_points_y = dict() + for label in labels: + labels_to_points_x[label] = list() + labels_to_points_y[label] = list() + for index, label in enumerate(labels): + labels_to_points_x[label].append(projected_data[index][0]) + labels_to_points_y[label].append(projected_data[index][1]) + + fig, ax = plt.subplots() + for label in set(labels): + x = numpy.array(labels_to_points_x[label]) + y = numpy.array(labels_to_points_y[label]) + print(x.shape) + print(label_to_color[label]) + ax.scatter(x=x, + y=y, + c=[label_to_color[label]] * len(x), + cmap='gist_rainbow', + label=label, + alpha=0.8) + if legend: + ax.legend() + fig.tight_layout() + 
ax.axis('off') + fig.subplots_adjust(top=0.9, bottom=0.0, right=1.0, left=0.0) + ax.set_title(title) if save_file_path is not None: plt.savefig(save_file_path) else: diff --git a/generate_audio_for_experiments.py b/generate_audio_for_experiments.py new file mode 100644 index 00000000..a86efef9 --- /dev/null +++ b/generate_audio_for_experiments.py @@ -0,0 +1,29 @@ +import os +import random + +import torch +from tqdm import tqdm + +from InferenceInterfaces.InferenceFastSpeech2 import InferenceFastSpeech2 + +torch.manual_seed(131714) +random.seed(131714) +torch.random.manual_seed(131714) + +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "6" # hardcoded gpu ID, be careful with this script + +############################################################################################################################################### + +os.makedirs("experiment_audios/german/low_diff", exist_ok=True) + +tts_low_diff = InferenceFastSpeech2(device="cuda" if torch.cuda.is_available() else "cpu", model_name="Meta_joint_finetune_german", language="de") +tts_low_diff.set_utterance_embedding("audios/german_female.wav") + +with open("experiment_audios/german/low_diff/transcripts_in_kaldi_format.txt", encoding="utf8", mode="r") as f: + trans = f.read() +for index, line in enumerate(tqdm(trans.split("\n"))): + if line.strip() != "": + assert line.startswith(f"{index} ") + text = line.lstrip(f"{index} ") + tts_low_diff.read_to_file([text], silent=True, file_location=f"experiment_audios/german/low_diff/{index}.wav") From cd4528724501e368c7a18c6b04dfba4230b02d1f Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Mon, 28 Feb 2022 19:44:06 +0100 Subject: [PATCH 02/33] fix speaker visualization --- Utility/EvaluationScripts/SpeakerVisualization.py | 12 ++++++------ run_speaker_visualization.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Utility/EvaluationScripts/SpeakerVisualization.py b/Utility/EvaluationScripts/SpeakerVisualization.py index cc533277..f95479eb 100644 --- a/Utility/EvaluationScripts/SpeakerVisualization.py +++ b/Utility/EvaluationScripts/SpeakerVisualization.py @@ -1,6 +1,7 @@ import matplotlib import numpy import soundfile as sf +from matplotlib import cm from matplotlib import pyplot as plt matplotlib.use("tkAgg") @@ -58,9 +59,11 @@ def visualize_speaker_embeddings(self, label_to_filepaths, title_of_plot, save_f legend=legend) def _plot_embeddings(self, projected_data, labels, title, save_file_path, legend): + colors = cm.gist_rainbow(numpy.linspace(0, 1, len(set(labels)))) label_to_color = dict() for index, label in enumerate(list(set(labels))): - label_to_color[label] = (1 / len(labels)) * index + label_to_color[label] = colors[index] + labels_to_points_x = dict() labels_to_points_y = dict() for label in labels: @@ -74,14 +77,11 @@ def _plot_embeddings(self, projected_data, labels, title, save_file_path, legend for label in set(labels): x = numpy.array(labels_to_points_x[label]) y = numpy.array(labels_to_points_y[label]) - print(x.shape) - print(label_to_color[label]) ax.scatter(x=x, y=y, - c=[label_to_color[label]] * len(x), - cmap='gist_rainbow', + c=label_to_color[label], label=label, - alpha=0.8) + alpha=0.9) if legend: ax.legend() fig.tight_layout() diff --git a/run_speaker_visualization.py b/run_speaker_visualization.py index b42ec0d4..a45f8e0c 100644 --- a/run_speaker_visualization.py +++ b/run_speaker_visualization.py @@ -37,7 +37,7 @@ def visualize_speakers_languages_crossover(): ltf = dict() vs = Visualizer() for file in 
os.listdir("speakers_for_plotting"): - label = file.split("_")[0] + label = file.split("_")[0].capitalize() + " Speaker" if label not in ltf: ltf[label] = list() ltf[label].append(f"speakers_for_plotting/{file}") From d7ad1db6323124611b578db4a61f13c24e1fb469 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Tue, 1 Mar 2022 00:41:40 +0100 Subject: [PATCH 03/33] add german multi pipeline --- .../TrainingPipelines/FastSpeech2_German.py | 77 +++++++++++++++++++ run_training_pipeline.py | 4 +- 2 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_German.py diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_German.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_German.py new file mode 100644 index 00000000..079d31ab --- /dev/null +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_German.py @@ -0,0 +1,77 @@ +import random + +import torch +from torch.utils.data import ConcatDataset + +from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 +from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop +from Utility.corpus_preparation import prepare_fastspeech_corpus +from Utility.path_to_transcript_dicts import * + + +def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): + if gpu_id == "cpu": + os.environ["CUDA_VISIBLE_DEVICES"] = "" + device = torch.device("cpu") + + else: + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}" + device = torch.device("cuda") + + torch.manual_seed(131714) + random.seed(131714) + torch.random.manual_seed(131714) + + print("Preparing") + + if model_dir is not None: + save_dir = model_dir + else: + save_dir = os.path.join("Models", "FastSpeech2_German") + os.makedirs(save_dir, exist_ok=True) + + datasets = list() + datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_karlsson(), + corpus_dir=os.path.join("Corpora", "Karlsson"), + lang="de")) + + datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_eva(), + corpus_dir=os.path.join("Corpora", "Eva"), + lang="de")) + + datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_bernd(), + corpus_dir=os.path.join("Corpora", "Bernd"), + lang="de")) + + datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_hokus(), + corpus_dir=os.path.join("Corpora", "Hokus"), + lang="de")) + + datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_hui_others(), + corpus_dir=os.path.join("Corpora", "hui_others"), + lang="de")) + + datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_thorsten(), + corpus_dir=os.path.join("Corpora", "Thorsten"), + lang="de")) + + train_set = ConcatDataset(datasets) + + model = FastSpeech2(lang_embs=100) + # because we want to finetune it, we treat it as multilingual, even though we are only interested in German here + + print("Training model") + train_loop(net=model, + train_dataset=train_set, + device=device, + save_directory=save_dir, + steps=500000, + batch_size=32, + lang="de", + lr=0.001, + epochs_per_save=1, + warmup_steps=4000, + path_to_checkpoint="Models/FastSpeech2_Meta/best.pt", + fine_tune=True, + resume=resume) diff --git a/run_training_pipeline.py b/run_training_pipeline.py index 57b2d0c1..caf26b22 100644 --- a/run_training_pipeline.py +++ b/run_training_pipeline.py @@ -1,6 +1,7 @@ import 
argparse import sys +from TrainingInterfaces.TrainingPipelines.FastSpeech2_German import run as full_ger from TrainingInterfaces.TrainingPipelines.FastSpeech2_GermanSingle import run as single_ger from TrainingInterfaces.TrainingPipelines.FastSpeech2_Karlsson import run as karlsson from TrainingInterfaces.TrainingPipelines.FastSpeech2_LJ import run as lj @@ -28,7 +29,8 @@ "low_rus" : low_rus, "low_ger" : low_ger, "single_ger" : single_ger, - "single_rus" : single_rus + "single_rus" : single_rus, + "full_ger" : full_ger } if __name__ == '__main__': From 55f6c9c0620e37f420e3b7c00768defcd517f3d2 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Wed, 2 Mar 2022 21:54:52 +0100 Subject: [PATCH 04/33] allow for phones to be already in the transcript --- .../AutoAligner/AlignerDataset.py | 13 +++-- .../FastSpeech2_LibriTTS_asr_out.py | 51 ++++++++++++++++++ .../FastSpeech2_LibriTTS_asr_phn.py | 53 +++++++++++++++++++ Utility/corpus_preparation.py | 4 +- Utility/path_to_transcript_dicts.py | 26 +++++++++ 5 files changed, 140 insertions(+), 7 deletions(-) create mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py create mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py diff --git a/TrainingInterfaces/Text_to_Spectrogram/AutoAligner/AlignerDataset.py b/TrainingInterfaces/Text_to_Spectrogram/AutoAligner/AlignerDataset.py index 1f6785ee..61842584 100644 --- a/TrainingInterfaces/Text_to_Spectrogram/AutoAligner/AlignerDataset.py +++ b/TrainingInterfaces/Text_to_Spectrogram/AutoAligner/AlignerDataset.py @@ -28,7 +28,8 @@ def __init__(self, cut_silences=True, rebuild_cache=False, verbose=False, - device="cpu"): + device="cpu", + phone_input=False): os.makedirs(cache_dir, exist_ok=True) if not os.path.exists(os.path.join(cache_dir, "aligner_train_cache.pt")) or rebuild_cache: if (device == "cuda" or device == torch.device("cuda")) and cut_silences: @@ -68,7 +69,8 @@ def __init__(self, max_len_in_seconds, cut_silences, verbose, - device), + device, + phone_input), daemon=True)) process_list[-1].start() for process in process_list: @@ -140,7 +142,8 @@ def cache_builder_process(self, max_len, cut_silences, verbose, - device): + device, + phone_input): process_internal_dataset_chunk = list() tf = ArticulatoryCombinedTextFrontend(language=lang, use_word_boundaries=False) _, sr = sf.read(path_list[0]) @@ -171,9 +174,9 @@ def cache_builder_process(self, # raw audio preprocessing is done transcript = self.path_to_transcript_dict[path] try: - cached_text = tf.string_to_tensor(transcript, handle_missing=False).squeeze(0).cpu().numpy() + cached_text = tf.string_to_tensor(transcript, handle_missing=False, input_phonemes=phone_input).squeeze(0).cpu().numpy() except KeyError: - tf.string_to_tensor(transcript, handle_missing=True).squeeze(0).cpu().numpy() + tf.string_to_tensor(transcript, handle_missing=True, input_phonemes=phone_input).squeeze(0).cpu().numpy() continue # we skip sentences with unknown symbols try: if len(cached_text[0]) != 66: diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py new file mode 100644 index 00000000..1a172714 --- /dev/null +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py @@ -0,0 +1,51 @@ +import random + +import torch + +from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 +from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop +from 
Utility.corpus_preparation import prepare_fastspeech_corpus +from Utility.path_to_transcript_dicts import * + + +def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): + if gpu_id == "cpu": + os.environ["CUDA_VISIBLE_DEVICES"] = "" + device = torch.device("cpu") + + else: + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(gpu_id) + device = torch.device("cuda") + + torch.manual_seed(131714) + random.seed(131714) + torch.random.manual_seed(131714) + + print("Preparing") + + if model_dir is not None: + save_dir = model_dir + else: + save_dir = os.path.join("Models", "FastSpeech2_LibriTTS_asr_out") + os.makedirs(save_dir, exist_ok=True) + + train_set = prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_asr_out(), + corpus_dir=os.path.join("Corpora", "libri_asr_out"), + lang="en") + + model = FastSpeech2(lang_embs=None) + + print("Training model") + train_loop(net=model, + train_dataset=train_set, + device=device, + save_directory=save_dir, + steps=500000, + batch_size=32, + lang="en", + lr=0.0001, + warmup_steps=4000, + path_to_checkpoint="Models/FastSpeech2_LibriTTS/best.pt", + fine_tune=True, + resume=resume) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py new file mode 100644 index 00000000..95f6bcea --- /dev/null +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py @@ -0,0 +1,53 @@ +import random + +import torch + +from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 +from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop +from Utility.corpus_preparation import prepare_fastspeech_corpus +from Utility.path_to_transcript_dicts import * + + +def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): + if gpu_id == "cpu": + os.environ["CUDA_VISIBLE_DEVICES"] = "" + device = torch.device("cpu") + + else: + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(gpu_id) + device = torch.device("cuda") + + torch.manual_seed(131714) + random.seed(131714) + torch.random.manual_seed(131714) + + print("Preparing") + + if model_dir is not None: + save_dir = model_dir + else: + save_dir = os.path.join("Models", "FastSpeech2_LibriTTS_asr_phn") + os.makedirs(save_dir, exist_ok=True) + + train_set = prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_asr_phn(), + corpus_dir=os.path.join("Corpora", "libri_asr_phn"), + lang="en", + phone_input=True, + ctc_selection=False) + + model = FastSpeech2(lang_embs=None) + + print("Training model") + train_loop(net=model, + train_dataset=train_set, + device=device, + save_directory=save_dir, + steps=500000, + batch_size=32, + lang="en", + lr=0.0001, + warmup_steps=4000, + path_to_checkpoint="Models/FastSpeech2_LibriTTS/best.pt", + fine_tune=True, + resume=resume) diff --git a/Utility/corpus_preparation.py b/Utility/corpus_preparation.py index 6bf6b1c6..e423adf4 100644 --- a/Utility/corpus_preparation.py +++ b/Utility/corpus_preparation.py @@ -11,7 +11,7 @@ def prepare_aligner_corpus(transcript_dict, corpus_dir, lang, device): return AlignerDataset(transcript_dict, cache_dir=corpus_dir, lang=lang, loading_processes=35, cut_silences=True, device=device) -def prepare_fastspeech_corpus(transcript_dict, corpus_dir, lang, ctc_selection=True, fine_tune_aligner=True, use_reconstruction=False): +def 
prepare_fastspeech_corpus(transcript_dict, corpus_dir, lang, ctc_selection=True, fine_tune_aligner=True, use_reconstruction=False, phone_input=False): """ create an aligner dataset, fine-tune an aligner, @@ -23,7 +23,7 @@ def prepare_fastspeech_corpus(transcript_dict, corpus_dir, lang, ctc_selection=T if fine_tune_aligner: aligner_dir = os.path.join(corpus_dir, "aligner") if not os.path.exists(os.path.join(aligner_dir, "aligner.pt")): - aligner_datapoints = AlignerDataset(transcript_dict, cache_dir=corpus_dir, lang=lang) + aligner_datapoints = AlignerDataset(transcript_dict, cache_dir=corpus_dir, lang=lang, phone_input=phone_input) train_aligner(train_dataset=aligner_datapoints, device=torch.device("cuda"), save_directory=aligner_dir, diff --git a/Utility/path_to_transcript_dicts.py b/Utility/path_to_transcript_dicts.py index 758e8163..080f277c 100644 --- a/Utility/path_to_transcript_dicts.py +++ b/Utility/path_to_transcript_dicts.py @@ -189,6 +189,32 @@ def build_path_to_transcript_dict_libritts(): return path_to_transcript +def build_path_to_transcript_dict_libritts_asr(label_file): + with open(label_file, encoding="utf8", mode="r") as f: + labels = f.read() + audio_handle_to_transcript = dict() + for line in labels: + if line.strip() == "": + continue + audio_handle_to_transcript[line.split()[0]] = line.lstrip(f"{line.split()[0]} ") + path_train = "/mount/resources/speech/corpora/LibriTTS/train-clean-100" + path_to_transcript = dict() + for speaker in os.listdir(path_train): + for chapter in os.listdir(os.path.join(path_train, speaker)): + for file in os.listdir(os.path.join(path_train, speaker, chapter)): + if file.endswith(".wav"): + path_to_transcript[os.path.join(path_train, speaker, chapter, file)] = audio_handle_to_transcript[file.split(".")[0]] + return path_to_transcript + + +def build_path_to_transcript_dict_libritts_asr_out(): + return build_path_to_transcript_dict_libritts_asr("/mount/arbeitsdaten45/projekte/asr-4/denisopl/tmp/libritts_train_600_tts-bpe100.txt") + + +def build_path_to_transcript_dict_libritts_asr_phn(): + return build_path_to_transcript_dict_libritts_asr("/mount/arbeitsdaten45/projekte/asr-4/denisopl/tmp/libritts_train_600_tts-phn-bpe100.txt") + + def build_path_to_transcript_dict_ljspeech(): path_to_transcript = dict() for transcript_file in os.listdir("/mount/resources/speech/corpora/LJSpeech/16kHz/txt"): From 640364f33ed1496b8a52c4c124ba515a02e380dc Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Wed, 2 Mar 2022 21:58:52 +0100 Subject: [PATCH 05/33] allow for phones to be already in the transcript --- InferenceInterfaces/InferenceFastSpeech2.py | 4 +-- generate_audio_for_experiments.py | 29 --------------------- run_training_pipeline.py | 6 ++++- 3 files changed, 7 insertions(+), 32 deletions(-) delete mode 100644 generate_audio_for_experiments.py diff --git a/InferenceInterfaces/InferenceFastSpeech2.py b/InferenceInterfaces/InferenceFastSpeech2.py index e3c5ea72..77a5a54c 100644 --- a/InferenceInterfaces/InferenceFastSpeech2.py +++ b/InferenceInterfaces/InferenceFastSpeech2.py @@ -46,9 +46,9 @@ def set_language(self, lang_id): self.text2phone = ArticulatoryCombinedTextFrontend(language=lang_id, add_silence_to_end=True) self.lang_id = get_language_id(lang_id).to(self.device) - def forward(self, text, view=False, durations=None, pitch=None, energy=None): + def forward(self, text, view=False, durations=None, pitch=None, energy=None, input_is_phones=False): with torch.inference_mode(): - phones = 
self.text2phone.string_to_tensor(text).to(torch.device(self.device)) + phones = self.text2phone.string_to_tensor(text, input_phonemes=input_is_phones).to(torch.device(self.device)) mel, durations, pitch, energy = self.phone2mel(phones, return_duration_pitch_energy=True, utterance_embedding=self.default_utterance_embedding, diff --git a/generate_audio_for_experiments.py b/generate_audio_for_experiments.py deleted file mode 100644 index a86efef9..00000000 --- a/generate_audio_for_experiments.py +++ /dev/null @@ -1,29 +0,0 @@ -import os -import random - -import torch -from tqdm import tqdm - -from InferenceInterfaces.InferenceFastSpeech2 import InferenceFastSpeech2 - -torch.manual_seed(131714) -random.seed(131714) -torch.random.manual_seed(131714) - -os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -os.environ["CUDA_VISIBLE_DEVICES"] = "6" # hardcoded gpu ID, be careful with this script - -############################################################################################################################################### - -os.makedirs("experiment_audios/german/low_diff", exist_ok=True) - -tts_low_diff = InferenceFastSpeech2(device="cuda" if torch.cuda.is_available() else "cpu", model_name="Meta_joint_finetune_german", language="de") -tts_low_diff.set_utterance_embedding("audios/german_female.wav") - -with open("experiment_audios/german/low_diff/transcripts_in_kaldi_format.txt", encoding="utf8", mode="r") as f: - trans = f.read() -for index, line in enumerate(tqdm(trans.split("\n"))): - if line.strip() != "": - assert line.startswith(f"{index} ") - text = line.lstrip(f"{index} ") - tts_low_diff.read_to_file([text], silent=True, file_location=f"experiment_audios/german/low_diff/{index}.wav") diff --git a/run_training_pipeline.py b/run_training_pipeline.py index caf26b22..ecf214a9 100644 --- a/run_training_pipeline.py +++ b/run_training_pipeline.py @@ -7,6 +7,8 @@ from TrainingInterfaces.TrainingPipelines.FastSpeech2_LJ import run as lj from TrainingInterfaces.TrainingPipelines.FastSpeech2_LJ_long import run as lj_long from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS import run as libri +from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS_asr_out import run as asr_out +from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS_asr_phn import run as asr_phn from TrainingInterfaces.TrainingPipelines.FastSpeech2_MetaCheckpoint import run as meta_fast from TrainingInterfaces.TrainingPipelines.FastSpeech2_MetaCheckpoint_germ_finetune import run as low_ger from TrainingInterfaces.TrainingPipelines.FastSpeech2_MetaCheckpoint_no_Germanic import run as no_ger @@ -30,7 +32,9 @@ "low_ger" : low_ger, "single_ger" : single_ger, "single_rus" : single_rus, - "full_ger" : full_ger + "full_ger" : full_ger, + "asr_out" : asr_out, + "asr_phn" : asr_phn } if __name__ == '__main__': From 0e37ec1bd72d2d3e2f4ad8c7de7dd84287aa9892 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Wed, 2 Mar 2022 22:07:12 +0100 Subject: [PATCH 06/33] fix missing linebreak --- Utility/path_to_transcript_dicts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Utility/path_to_transcript_dicts.py b/Utility/path_to_transcript_dicts.py index 080f277c..c3f07448 100644 --- a/Utility/path_to_transcript_dicts.py +++ b/Utility/path_to_transcript_dicts.py @@ -193,7 +193,7 @@ def build_path_to_transcript_dict_libritts_asr(label_file): with open(label_file, encoding="utf8", mode="r") as f: labels = f.read() audio_handle_to_transcript = dict() - for line in labels: + for line in 
labels.split("\n"): if line.strip() == "": continue audio_handle_to_transcript[line.split()[0]] = line.lstrip(f"{line.split()[0]} ") From bd42005857fd7fdf10f13f77ca03ed05e298d983 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Thu, 3 Mar 2022 00:56:15 +0100 Subject: [PATCH 07/33] minor fixes --- Preprocessing/AudioPreprocessor.py | 7 ++++++- .../{FastSpeech2_LJ_long.py => FastSpeech2_Nancy.py} | 12 ++++++------ Utility/corpus_preparation.py | 2 +- Utility/path_to_transcript_dicts.py | 5 ++++- run_training_pipeline.py | 4 ++-- 5 files changed, 19 insertions(+), 11 deletions(-) rename TrainingInterfaces/TrainingPipelines/{FastSpeech2_LJ_long.py => FastSpeech2_Nancy.py} (83%) diff --git a/Preprocessing/AudioPreprocessor.py b/Preprocessing/AudioPreprocessor.py index a98cea86..5adaae5d 100644 --- a/Preprocessing/AudioPreprocessor.py +++ b/Preprocessing/AudioPreprocessor.py @@ -58,7 +58,12 @@ def cut_silence_from_audio(self, audio): """ with torch.inference_mode(): speech_timestamps = self.get_speech_timestamps(audio, self.silero_model, sampling_rate=self.final_sr) - return audio[speech_timestamps[0]['start']:speech_timestamps[-1]['end']] + try: + result = audio[speech_timestamps[0]['start']:speech_timestamps[-1]['end']] + return result + except IndexError: + print("Audio might be too short to cut silences from front and back.") + return audio def to_mono(self, x): """ diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LJ_long.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_Nancy.py similarity index 83% rename from TrainingInterfaces/TrainingPipelines/FastSpeech2_LJ_long.py rename to TrainingInterfaces/TrainingPipelines/FastSpeech2_Nancy.py index 13e0f3a5..c71b30c4 100644 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LJ_long.py +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_Nancy.py @@ -27,14 +27,14 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): if model_dir is not None: save_dir = model_dir else: - save_dir = os.path.join("Models", "FastSpeech2_LJ_long") + save_dir = os.path.join("Models", "FastSpeech2_Nancy") os.makedirs(save_dir, exist_ok=True) - train_set = prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_3xljspeech(), - corpus_dir=os.path.join("Corpora", "LJSpeech_long"), + train_set = prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_nancy(), + corpus_dir=os.path.join("Corpora", "Nancy"), lang="en") - model = FastSpeech2(lang_embs=None, utt_embed_dim=None) + model = FastSpeech2() print("Training model") train_loop(net=model, @@ -47,6 +47,6 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): lr=0.001, epochs_per_save=15, warmup_steps=4000, - path_to_checkpoint=resume_checkpoint, - fine_tune=finetune, + path_to_checkpoint="Models/FastSpeech2_Meta/best.pt", + fine_tune=True, resume=resume) diff --git a/Utility/corpus_preparation.py b/Utility/corpus_preparation.py index e423adf4..31e2ed96 100644 --- a/Utility/corpus_preparation.py +++ b/Utility/corpus_preparation.py @@ -23,7 +23,7 @@ def prepare_fastspeech_corpus(transcript_dict, corpus_dir, lang, ctc_selection=T if fine_tune_aligner: aligner_dir = os.path.join(corpus_dir, "aligner") if not os.path.exists(os.path.join(aligner_dir, "aligner.pt")): - aligner_datapoints = AlignerDataset(transcript_dict, cache_dir=corpus_dir, lang=lang, phone_input=phone_input) + aligner_datapoints = AlignerDataset(transcript_dict, cache_dir=corpus_dir, lang=lang, phone_input=phone_input, device=torch.device("cuda")) 
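
The phone_input flag that this hunk forwards to AlignerDataset lets a corpus ship transcripts that are already phoneme strings (such as the ASR-produced LibriTTS labels introduced in patch 04); the flag is handed through to tf.string_to_tensor(transcript, input_phonemes=phone_input), so the frontend skips grapheme-to-phoneme conversion for those corpora. A minimal usage sketch, built only from the signatures visible in these patches and mirroring the settings of the FastSpeech2_LibriTTS_asr_phn pipeline; treat it as an illustration rather than a tested call:

    import os

    from Utility.corpus_preparation import prepare_fastspeech_corpus
    from Utility.path_to_transcript_dicts import build_path_to_transcript_dict_libritts_asr_phn

    # the transcripts in this dict are already phoneme sequences produced by an ASR system,
    # so the text frontend must be told not to phonemize them a second time
    train_set = prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_asr_phn(),
                                          corpus_dir=os.path.join("Corpora", "libri_asr_phn"),
                                          lang="en",
                                          phone_input=True,     # handed down to AlignerDataset and the text frontend
                                          ctc_selection=False)  # same choice as in FastSpeech2_LibriTTS_asr_phn.py

At inference time the same idea surfaces as the input_is_phones argument that patch 05 adds to InferenceFastSpeech2.forward.
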
train_aligner(train_dataset=aligner_datapoints, device=torch.device("cuda"), save_directory=aligner_dir, diff --git a/Utility/path_to_transcript_dicts.py b/Utility/path_to_transcript_dicts.py index c3f07448..6e2c390f 100644 --- a/Utility/path_to_transcript_dicts.py +++ b/Utility/path_to_transcript_dicts.py @@ -203,7 +203,10 @@ def build_path_to_transcript_dict_libritts_asr(label_file): for chapter in os.listdir(os.path.join(path_train, speaker)): for file in os.listdir(os.path.join(path_train, speaker, chapter)): if file.endswith(".wav"): - path_to_transcript[os.path.join(path_train, speaker, chapter, file)] = audio_handle_to_transcript[file.split(".")[0]] + try: + path_to_transcript[os.path.join(path_train, speaker, chapter, file)] = audio_handle_to_transcript[file.split(".")[0]] + except KeyError: + print(f"Problem with {file}, no transcription found!") return path_to_transcript diff --git a/run_training_pipeline.py b/run_training_pipeline.py index ecf214a9..9610394a 100644 --- a/run_training_pipeline.py +++ b/run_training_pipeline.py @@ -5,7 +5,6 @@ from TrainingInterfaces.TrainingPipelines.FastSpeech2_GermanSingle import run as single_ger from TrainingInterfaces.TrainingPipelines.FastSpeech2_Karlsson import run as karlsson from TrainingInterfaces.TrainingPipelines.FastSpeech2_LJ import run as lj -from TrainingInterfaces.TrainingPipelines.FastSpeech2_LJ_long import run as lj_long from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS import run as libri from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS_asr_out import run as asr_out from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS_asr_phn import run as asr_phn @@ -14,6 +13,7 @@ from TrainingInterfaces.TrainingPipelines.FastSpeech2_MetaCheckpoint_no_Germanic import run as no_ger from TrainingInterfaces.TrainingPipelines.FastSpeech2_MetaCheckpoint_no_Slavic import run as no_slav from TrainingInterfaces.TrainingPipelines.FastSpeech2_MetaCheckpoint_rus_finetune import run as low_rus +from TrainingInterfaces.TrainingPipelines.FastSpeech2_Nancy import run as nancy from TrainingInterfaces.TrainingPipelines.FastSpeech2_RussianSingle import run as single_rus from TrainingInterfaces.TrainingPipelines.HiFiGAN_combined import run as hifigan_combined from TrainingInterfaces.TrainingPipelines.pretrain_aligner import run as aligner @@ -23,7 +23,7 @@ "meta" : meta_fast, "karlsson" : karlsson, "lj" : lj, - "lj_long" : lj_long, + "nancy" : nancy, "hifi_combined": hifigan_combined, "aligner" : aligner, "no_ger" : no_ger, From e0e237fa706a86d0a90f5e0331b177499d27c87c Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Thu, 3 Mar 2022 11:52:26 +0100 Subject: [PATCH 08/33] run silence cutoff always on cpu --- .../Text_to_Spectrogram/AutoAligner/AlignerDataset.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/TrainingInterfaces/Text_to_Spectrogram/AutoAligner/AlignerDataset.py b/TrainingInterfaces/Text_to_Spectrogram/AutoAligner/AlignerDataset.py index 61842584..1ae450fa 100644 --- a/TrainingInterfaces/Text_to_Spectrogram/AutoAligner/AlignerDataset.py +++ b/TrainingInterfaces/Text_to_Spectrogram/AutoAligner/AlignerDataset.py @@ -8,7 +8,6 @@ from speechbrain.pretrained import EncoderClassifier from torch.multiprocessing import Manager from torch.multiprocessing import Process -from torch.multiprocessing import set_start_method from torch.utils.data import Dataset from tqdm import tqdm @@ -32,14 +31,8 @@ def __init__(self, phone_input=False): os.makedirs(cache_dir, exist_ok=True) if not 
os.path.exists(os.path.join(cache_dir, "aligner_train_cache.pt")) or rebuild_cache: - if (device == "cuda" or device == torch.device("cuda")) and cut_silences: - try: - set_start_method('spawn') # in order to be able to make use of cuda in multiprocessing - except RuntimeError: - pass - elif cut_silences: - torch.set_num_threads(1) if cut_silences: + torch.set_num_threads(1) torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=False, @@ -69,7 +62,7 @@ def __init__(self, max_len_in_seconds, cut_silences, verbose, - device, + "cpu", phone_input), daemon=True)) process_list[-1].start() From 7298b3f33b8a4336a740ff80c2098125b35243f5 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Thu, 3 Mar 2022 12:00:28 +0100 Subject: [PATCH 09/33] run libri pipeline in find faulty samples mode --- .../FastSpeech2_LibriTTS_asr_out.py | 49 ++++++++++++++++++- .../FastSpeech2_LibriTTS_asr_phn.py | 49 ++++++++++++++++++- 2 files changed, 96 insertions(+), 2 deletions(-) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py index 1a172714..721bf8cc 100644 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py @@ -1,6 +1,7 @@ import random import torch +from tqdm import tqdm from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop @@ -34,7 +35,15 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): corpus_dir=os.path.join("Corpora", "libri_asr_out"), lang="en") - model = FastSpeech2(lang_embs=None) + model = FastSpeech2() + + find_faulty_samples(net=model, + datasets=train_set, + device=torch.device("cuda"), + path_to_checkpoint="Models/FastSpeech2_LibriTTS/best.pt") + + import sys + sys.exit() print("Training model") train_loop(net=model, @@ -49,3 +58,41 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): path_to_checkpoint="Models/FastSpeech2_LibriTTS/best.pt", fine_tune=True, resume=resume) + + +@torch.inference_mode() +def find_faulty_samples(net, + datasets, + device, + path_to_checkpoint): + net = net.to(device) + torch.multiprocessing.set_sharing_strategy('file_system') + check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) + net.load_state_dict(check_dict["model"]) + losses = list() + index_pairs = list() + for datapoint_index in tqdm(range(len(datasets))): + loss = net(text_tensors=datasets[datapoint_index][0].unsqueeze(0).to(device), + text_lengths=datasets[datapoint_index][1].to(device), + gold_speech=datasets[datapoint_index][2].unsqueeze(0).to(device), + speech_lengths=datasets[datapoint_index][3].to(device), + gold_durations=datasets[datapoint_index][4].unsqueeze(0).to(device), + gold_pitch=datasets[datapoint_index][6].unsqueeze(0).to(device), # mind the switched order + gold_energy=datasets[datapoint_index][5].unsqueeze(0).to(device), # mind the switched order + utterance_embedding=datasets[datapoint_index][7].unsqueeze(0).to(device), + lang_ids=datasets[datapoint_index][8].unsqueeze(0).to(device), + return_mels=False).squeeze() + if torch.isnan(loss): + print(f"CAREFUL, NAN DETECTED: {datapoint_index}") + losses.append(999999) + else: + losses.append(loss.item()) + index_pairs.append(datapoint_index) + + loss_high_to_low = sorted(losses, reverse=True) + print(loss_high_to_low) + threshold = 
loss_high_to_low[500] + for index, loss in enumerate(losses): + if loss > threshold: + print(index_pairs[index]) + print(loss) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py index 95f6bcea..fe4ce728 100644 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py @@ -1,6 +1,7 @@ import random import torch +from tqdm import tqdm from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop @@ -36,7 +37,15 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): phone_input=True, ctc_selection=False) - model = FastSpeech2(lang_embs=None) + model = FastSpeech2() + + find_faulty_samples(net=model, + datasets=train_set, + device=torch.device("cuda"), + path_to_checkpoint="Models/FastSpeech2_LibriTTS/best.pt") + + import sys + sys.exit() print("Training model") train_loop(net=model, @@ -51,3 +60,41 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): path_to_checkpoint="Models/FastSpeech2_LibriTTS/best.pt", fine_tune=True, resume=resume) + + +@torch.inference_mode() +def find_faulty_samples(net, + datasets, + device, + path_to_checkpoint): + net = net.to(device) + torch.multiprocessing.set_sharing_strategy('file_system') + check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) + net.load_state_dict(check_dict["model"]) + losses = list() + index_pairs = list() + for datapoint_index in tqdm(range(len(datasets))): + loss = net(text_tensors=datasets[datapoint_index][0].unsqueeze(0).to(device), + text_lengths=datasets[datapoint_index][1].to(device), + gold_speech=datasets[datapoint_index][2].unsqueeze(0).to(device), + speech_lengths=datasets[datapoint_index][3].to(device), + gold_durations=datasets[datapoint_index][4].unsqueeze(0).to(device), + gold_pitch=datasets[datapoint_index][6].unsqueeze(0).to(device), # mind the switched order + gold_energy=datasets[datapoint_index][5].unsqueeze(0).to(device), # mind the switched order + utterance_embedding=datasets[datapoint_index][7].unsqueeze(0).to(device), + lang_ids=datasets[datapoint_index][8].unsqueeze(0).to(device), + return_mels=False).squeeze() + if torch.isnan(loss): + print(f"CAREFUL, NAN DETECTED: {datapoint_index}") + losses.append(999999) + else: + losses.append(loss.item()) + index_pairs.append(datapoint_index) + + loss_high_to_low = sorted(losses, reverse=True) + print(loss_high_to_low) + threshold = loss_high_to_low[500] + for index, loss in enumerate(losses): + if loss > threshold: + print(index_pairs[index]) + print(loss) From a18ed8a2c6d8a8e4c1621c264bbf910e40737db0 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Thu, 3 Mar 2022 12:08:37 +0100 Subject: [PATCH 10/33] fix wrong model config --- .../TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py | 4 ++-- .../TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py index 721bf8cc..43373121 100644 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py @@ -35,7 +35,7 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): 
corpus_dir=os.path.join("Corpora", "libri_asr_out"), lang="en") - model = FastSpeech2() + model = FastSpeech2(lang_embs=None) find_faulty_samples(net=model, datasets=train_set, @@ -80,7 +80,7 @@ def find_faulty_samples(net, gold_pitch=datasets[datapoint_index][6].unsqueeze(0).to(device), # mind the switched order gold_energy=datasets[datapoint_index][5].unsqueeze(0).to(device), # mind the switched order utterance_embedding=datasets[datapoint_index][7].unsqueeze(0).to(device), - lang_ids=datasets[datapoint_index][8].unsqueeze(0).to(device), + lang_ids=None, return_mels=False).squeeze() if torch.isnan(loss): print(f"CAREFUL, NAN DETECTED: {datapoint_index}") diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py index fe4ce728..5587fbde 100644 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py @@ -37,7 +37,7 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): phone_input=True, ctc_selection=False) - model = FastSpeech2() + model = FastSpeech2(lang_embs=None) find_faulty_samples(net=model, datasets=train_set, @@ -82,7 +82,7 @@ def find_faulty_samples(net, gold_pitch=datasets[datapoint_index][6].unsqueeze(0).to(device), # mind the switched order gold_energy=datasets[datapoint_index][5].unsqueeze(0).to(device), # mind the switched order utterance_embedding=datasets[datapoint_index][7].unsqueeze(0).to(device), - lang_ids=datasets[datapoint_index][8].unsqueeze(0).to(device), + lang_ids=None, return_mels=False).squeeze() if torch.isnan(loss): print(f"CAREFUL, NAN DETECTED: {datapoint_index}") From 3fb401abf2511be550beb96b192e1b63ef219c4c Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Thu, 3 Mar 2022 12:40:06 +0100 Subject: [PATCH 11/33] revert to regular training mode --- .../TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py | 8 -------- .../TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py | 8 -------- 2 files changed, 16 deletions(-) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py index 43373121..c432f843 100644 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py @@ -37,14 +37,6 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): model = FastSpeech2(lang_embs=None) - find_faulty_samples(net=model, - datasets=train_set, - device=torch.device("cuda"), - path_to_checkpoint="Models/FastSpeech2_LibriTTS/best.pt") - - import sys - sys.exit() - print("Training model") train_loop(net=model, train_dataset=train_set, diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py index 5587fbde..e724ed35 100644 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py @@ -39,14 +39,6 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): model = FastSpeech2(lang_embs=None) - find_faulty_samples(net=model, - datasets=train_set, - device=torch.device("cuda"), - path_to_checkpoint="Models/FastSpeech2_LibriTTS/best.pt") - - import sys - sys.exit() - print("Training model") train_loop(net=model, train_dataset=train_set, From c69e603ab09cb39dd9826d34a27115e5e6048d30 Mon Sep 17 00:00:00 2001 
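
The Kaldi-style label files parsed in patch 04 (build_path_to_transcript_dict_libritts_asr) and again in the other-500 variant added in the next patch split off the utterance handle with line.lstrip(f"{line.split()[0]} "); str.lstrip removes a character set rather than a prefix, so a transcript that happens to begin with characters that also occur in the handle silently loses them. A safer split, shown as a standalone sketch (the helper name and the example line are invented, only the "handle transcript" format matches the label files used above):

    def parse_kaldi_style_labels(label_text):
        # maps utterance handle -> transcript by splitting off the handle as a prefix
        handle_to_transcript = dict()
        for line in label_text.split("\n"):
            if line.strip() == "":
                continue
            handle, transcript = line.split(maxsplit=1)  # split only at the first whitespace
            handle_to_transcript[handle] = transcript
        return handle_to_transcript

    print(parse_kaldi_style_labels("19_198_000000_000003 19 men stood outside"))
    # {'19_198_000000_000003': '19 men stood outside'}
    # the lstrip pattern would instead yield 'men stood outside', because '1', '9' and ' '
    # are all characters of the handle and therefore get stripped from the transcript as well
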
From: Florian Lux Date: Fri, 4 Mar 2022 17:52:23 +0100 Subject: [PATCH 12/33] add libritts 600 recipes --- .../FastSpeech2_LibriTTS_600.py | 94 +++++++++++++++++++ .../FastSpeech2_LibriTTS_asr_phn_600.py | 94 +++++++++++++++++++ Utility/EvaluationScripts/GermanWER.py | 41 -------- Utility/EvaluationScripts/RussianWER.py | 44 --------- Utility/path_to_transcript_dicts.py | 43 +++++++++ run_training_pipeline.py | 6 +- 6 files changed, 236 insertions(+), 86 deletions(-) create mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_600.py create mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn_600.py delete mode 100644 Utility/EvaluationScripts/GermanWER.py delete mode 100644 Utility/EvaluationScripts/RussianWER.py diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_600.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_600.py new file mode 100644 index 00000000..a370528d --- /dev/null +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_600.py @@ -0,0 +1,94 @@ +import random + +import torch +from torch.utils.data import ConcatDataset +from tqdm import tqdm + +from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 +from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop +from Utility.corpus_preparation import prepare_fastspeech_corpus +from Utility.path_to_transcript_dicts import * + + +def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): + if gpu_id == "cpu": + os.environ["CUDA_VISIBLE_DEVICES"] = "" + device = torch.device("cpu") + + else: + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(gpu_id) + device = torch.device("cuda") + + torch.manual_seed(131714) + random.seed(131714) + torch.random.manual_seed(131714) + + print("Preparing") + + if model_dir is not None: + save_dir = model_dir + else: + save_dir = os.path.join("Models", "FastSpeech2_LibriTTS_600") + os.makedirs(save_dir, exist_ok=True) + + train_set = prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_other500(), + corpus_dir=os.path.join("Corpora", "libri_500"), + lang="en", + phone_input=False, + ctc_selection=True) + + model = FastSpeech2(lang_embs=None) + + find_faulty_samples(model, train_set, device, "Models/FastSpeech2_LibriTTS/best.pt") + + train_sets = list() + train_sets.append(train_set) + train_sets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts(), + corpus_dir=os.path.join("Corpora", "libri"), + lang="en", + phone_input=False, + ctc_selection=True)) + + train_set = ConcatDataset(train_sets) + + print("Training model") + train_loop(net=model, + train_dataset=train_set, + device=device, + save_directory=save_dir, + steps=500000, + batch_size=32, + lang="en", + lr=0.0001, + warmup_steps=4000, + path_to_checkpoint="Models/FastSpeech2_LibriTTS/best.pt", + fine_tune=True, + resume=resume) + + +@torch.inference_mode() +def find_faulty_samples(net, + dataset, + device, + path_to_checkpoint): + nan_ids = list() + net = net.to(device) + torch.multiprocessing.set_sharing_strategy('file_system') + check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) + net.load_state_dict(check_dict["model"]) + for datapoint_index in tqdm(range(len(dataset))): + loss = net(text_tensors=dataset[datapoint_index][0].unsqueeze(0).to(device), + text_lengths=dataset[datapoint_index][1].to(device), + 
gold_speech=dataset[datapoint_index][2].unsqueeze(0).to(device), + speech_lengths=dataset[datapoint_index][3].to(device), + gold_durations=dataset[datapoint_index][4].unsqueeze(0).to(device), + gold_pitch=dataset[datapoint_index][6].unsqueeze(0).to(device), # mind the switched order + gold_energy=dataset[datapoint_index][5].unsqueeze(0).to(device), # mind the switched order + utterance_embedding=dataset[datapoint_index][7].unsqueeze(0).to(device), + lang_ids=None, + return_mels=False).squeeze() + if torch.isnan(loss): + print(f"CAREFUL, NAN DETECTED: {datapoint_index}") + nan_ids.append(datapoint_index) + dataset.remove_samples(nan_ids) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn_600.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn_600.py new file mode 100644 index 00000000..d2863801 --- /dev/null +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn_600.py @@ -0,0 +1,94 @@ +import random + +import torch +from torch.utils.data import ConcatDataset +from tqdm import tqdm + +from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 +from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop +from Utility.corpus_preparation import prepare_fastspeech_corpus +from Utility.path_to_transcript_dicts import * + + +def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): + if gpu_id == "cpu": + os.environ["CUDA_VISIBLE_DEVICES"] = "" + device = torch.device("cpu") + + else: + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(gpu_id) + device = torch.device("cuda") + + torch.manual_seed(131714) + random.seed(131714) + torch.random.manual_seed(131714) + + print("Preparing") + + if model_dir is not None: + save_dir = model_dir + else: + save_dir = os.path.join("Models", "FastSpeech2_LibriTTS_asr_phn_600") + os.makedirs(save_dir, exist_ok=True) + + train_set = prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_asr_phn_500(), + corpus_dir=os.path.join("Corpora", "libri_asr_phn_500"), + lang="en", + phone_input=True, + ctc_selection=False) + + model = FastSpeech2(lang_embs=None) + + find_faulty_samples(model, train_set, device, "Models/FastSpeech2_LibriTTS_asr_phn/best.pt") + + train_sets = list() + train_sets.append(train_set) + train_sets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_asr_phn(), + corpus_dir=os.path.join("Corpora", "libri_asr_phn"), + lang="en", + phone_input=True, + ctc_selection=False)) + + train_set = ConcatDataset(train_sets) + + print("Training model") + train_loop(net=model, + train_dataset=train_set, + device=device, + save_directory=save_dir, + steps=500000, + batch_size=32, + lang="en", + lr=0.0001, + warmup_steps=4000, + path_to_checkpoint="Models/FastSpeech2_LibriTTS_asr_phn/best.pt", + fine_tune=True, + resume=resume) + + +@torch.inference_mode() +def find_faulty_samples(net, + dataset, + device, + path_to_checkpoint): + nan_ids = list() + net = net.to(device) + torch.multiprocessing.set_sharing_strategy('file_system') + check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) + net.load_state_dict(check_dict["model"]) + for datapoint_index in tqdm(range(len(dataset))): + loss = net(text_tensors=dataset[datapoint_index][0].unsqueeze(0).to(device), + text_lengths=dataset[datapoint_index][1].to(device), + gold_speech=dataset[datapoint_index][2].unsqueeze(0).to(device), + 
speech_lengths=dataset[datapoint_index][3].to(device), + gold_durations=dataset[datapoint_index][4].unsqueeze(0).to(device), + gold_pitch=dataset[datapoint_index][6].unsqueeze(0).to(device), # mind the switched order + gold_energy=dataset[datapoint_index][5].unsqueeze(0).to(device), # mind the switched order + utterance_embedding=dataset[datapoint_index][7].unsqueeze(0).to(device), + lang_ids=None, + return_mels=False).squeeze() + if torch.isnan(loss): + print(f"CAREFUL, NAN DETECTED: {datapoint_index}") + nan_ids.append(datapoint_index) + dataset.remove_samples(nan_ids) diff --git a/Utility/EvaluationScripts/GermanWER.py b/Utility/EvaluationScripts/GermanWER.py deleted file mode 100644 index fb7f8d78..00000000 --- a/Utility/EvaluationScripts/GermanWER.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-german -""" - -import torch -from datasets import load_dataset -from transformers import Wav2Vec2ForCTC -from transformers import Wav2Vec2Processor - -LANG_ID = "de" -MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-german" -SAMPLES = 10 - -test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]") - -processor = Wav2Vec2Processor.from_pretrained(MODEL_ID) -model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID) - - -# Preprocessing the datasets. -# We need to read the audio files as arrays -def speech_file_to_array_fn(batch): - speech_array = batch["audio"]["array"] - batch["speech"] = speech_array - batch["sentence"] = batch["sentence"].upper() - return batch - - -test_dataset = test_dataset.map(speech_file_to_array_fn) -inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True) - -with torch.no_grad(): - logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits - -predicted_ids = torch.argmax(logits, dim=-1) -predicted_sentences = processor.batch_decode(predicted_ids) - -for i, predicted_sentence in enumerate(predicted_sentences): - print("-" * 100) - print("Reference:", test_dataset[i]["sentence"]) - print("Prediction:", predicted_sentence) diff --git a/Utility/EvaluationScripts/RussianWER.py b/Utility/EvaluationScripts/RussianWER.py deleted file mode 100644 index 69e0ed36..00000000 --- a/Utility/EvaluationScripts/RussianWER.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-russian -""" - -import torch -from datasets import load_dataset -from transformers import Wav2Vec2ForCTC -from transformers import Wav2Vec2Processor - -LANG_ID = "ru" -MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-russian" -SAMPLES = 5 - -test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]") - -processor = Wav2Vec2Processor.from_pretrained(MODEL_ID) -model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID) - - -# Preprocessing the datasets. 
-# We need to read the audio files as arrays -def speech_file_to_array_fn(batch): - speech_array = batch["audio"]["array"] - batch["speech"] = speech_array - batch["sentence"] = batch["sentence"].upper() - return batch - - -test_dataset = test_dataset.map(speech_file_to_array_fn) - -print(test_dataset["speech"]) - -inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True) - -with torch.no_grad(): - logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits - -predicted_ids = torch.argmax(logits, dim=-1) -predicted_sentences = processor.batch_decode(predicted_ids) - -for i, predicted_sentence in enumerate(predicted_sentences): - print("-" * 100) - print("Reference:", test_dataset[i]["sentence"]) - print("Prediction:", predicted_sentence) diff --git a/Utility/path_to_transcript_dicts.py b/Utility/path_to_transcript_dicts.py index 6e2c390f..29cbbee9 100644 --- a/Utility/path_to_transcript_dicts.py +++ b/Utility/path_to_transcript_dicts.py @@ -189,6 +189,41 @@ def build_path_to_transcript_dict_libritts(): return path_to_transcript +def build_path_to_transcript_dict_libritts_other500(): + path_train = "/mount/resources/asr-data/LibriTTS/train-other-500" + path_to_transcript = dict() + for speaker in os.listdir(path_train): + for chapter in os.listdir(os.path.join(path_train, speaker)): + for file in os.listdir(os.path.join(path_train, speaker, chapter)): + if file.endswith("normalized.txt"): + with open(os.path.join(path_train, speaker, chapter, file), 'r', encoding='utf8') as tf: + transcript = tf.read() + wav_file = file.split(".")[0] + ".wav" + path_to_transcript[os.path.join(path_train, speaker, chapter, wav_file)] = transcript + return path_to_transcript + + +def build_path_to_transcript_dict_libritts_asr_other500(label_file): + with open(label_file, encoding="utf8", mode="r") as f: + labels = f.read() + audio_handle_to_transcript = dict() + for line in labels.split("\n"): + if line.strip() == "": + continue + audio_handle_to_transcript[line.split()[0]] = line.lstrip(f"{line.split()[0]} ") + path_train = "/mount/resources/asr-data/LibriTTS/train-other-500" + path_to_transcript = dict() + for speaker in os.listdir(path_train): + for chapter in os.listdir(os.path.join(path_train, speaker)): + for file in os.listdir(os.path.join(path_train, speaker, chapter)): + if file.endswith(".wav"): + try: + path_to_transcript[os.path.join(path_train, speaker, chapter, file)] = audio_handle_to_transcript[file.split(".")[0]] + except KeyError: + print(f"Problem with {file}, no transcription found!") + return path_to_transcript + + def build_path_to_transcript_dict_libritts_asr(label_file): with open(label_file, encoding="utf8", mode="r") as f: labels = f.read() @@ -218,6 +253,14 @@ def build_path_to_transcript_dict_libritts_asr_phn(): return build_path_to_transcript_dict_libritts_asr("/mount/arbeitsdaten45/projekte/asr-4/denisopl/tmp/libritts_train_600_tts-phn-bpe100.txt") +def build_path_to_transcript_dict_libritts_asr_out_500(): + return build_path_to_transcript_dict_libritts_asr_other500("/mount/arbeitsdaten45/projekte/asr-4/denisopl/tmp/libritts_train_600_tts-bpe100.txt") + + +def build_path_to_transcript_dict_libritts_asr_phn_500(): + return build_path_to_transcript_dict_libritts_asr_other500("/mount/arbeitsdaten45/projekte/asr-4/denisopl/tmp/libritts_train_600_tts-phn-bpe100.txt") + + def build_path_to_transcript_dict_ljspeech(): path_to_transcript = dict() for transcript_file in 
os.listdir("/mount/resources/speech/corpora/LJSpeech/16kHz/txt"): diff --git a/run_training_pipeline.py b/run_training_pipeline.py index 9610394a..70bc717f 100644 --- a/run_training_pipeline.py +++ b/run_training_pipeline.py @@ -6,8 +6,10 @@ from TrainingInterfaces.TrainingPipelines.FastSpeech2_Karlsson import run as karlsson from TrainingInterfaces.TrainingPipelines.FastSpeech2_LJ import run as lj from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS import run as libri +from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS_600 import run as libri600 from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS_asr_out import run as asr_out from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS_asr_phn import run as asr_phn +from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS_asr_phn_600 import run as phn600 from TrainingInterfaces.TrainingPipelines.FastSpeech2_MetaCheckpoint import run as meta_fast from TrainingInterfaces.TrainingPipelines.FastSpeech2_MetaCheckpoint_germ_finetune import run as low_ger from TrainingInterfaces.TrainingPipelines.FastSpeech2_MetaCheckpoint_no_Germanic import run as no_ger @@ -34,7 +36,9 @@ "single_rus" : single_rus, "full_ger" : full_ger, "asr_out" : asr_out, - "asr_phn" : asr_phn + "asr_phn" : asr_phn, + "phn600" : phn600, + "libri600" : libri600 } if __name__ == '__main__': From a4c70256134df973e78a6e63623ce61322d82adb Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Fri, 4 Mar 2022 18:23:15 +0100 Subject: [PATCH 13/33] rework meta checkpoint generation to make more sense --- .../FastSpeech2/meta_train_loop.py | 4 +- .../FastSpeech2_MetaCheckpoint.py | 301 ++++++++++-------- 2 files changed, 163 insertions(+), 142 deletions(-) diff --git a/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py b/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py index 2db3d976..a79d03d0 100644 --- a/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py +++ b/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py @@ -1,3 +1,5 @@ +import random + import librosa.display as lbd import matplotlib.pyplot as plt import torch @@ -84,7 +86,7 @@ def train_loop(net, # ============================= for step in tqdm(range(step_counter, steps)): batches = [] - for index in range(len(datasets)): + for index in random.sample(list(range(len(datasets)))): # we get one batch for each task (i.e. 
language in this case) try: batch = next(train_iters[index]) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py index 8e568a24..7f5e2b53 100644 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py @@ -2,6 +2,7 @@ import torch import torch.multiprocessing +from torch.utils.data import ConcatDataset from tqdm import tqdm from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 @@ -10,7 +11,7 @@ from Utility.path_to_transcript_dicts import * -def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, find_faulty_samples_mode=False): +def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, remove_faulty_samples=False): torch.manual_seed(131714) random.seed(131714) torch.random.manual_seed(131714) @@ -28,141 +29,166 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, find_faulty_samp os.makedirs(meta_save_dir, exist_ok=True) print("Preparing") - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_nancy(), - corpus_dir=os.path.join("Corpora", "Nancy"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_karlsson(), - corpus_dir=os.path.join("Corpora", "Karlsson"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10el(), - corpus_dir=os.path.join("Corpora", "meta_Greek"), - lang="el")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_spanish_blizzard_train(), - corpus_dir=os.path.join("Corpora", "spanish_blizzard"), - lang="es")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10fi(), - corpus_dir=os.path.join("Corpora", "meta_Finnish"), - lang="fi")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10ru(), - corpus_dir=os.path.join("Corpora", "meta_Russian"), - lang="ru")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10hu(), - corpus_dir=os.path.join("Corpora", "meta_Hungarian"), - lang="hu")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10nl(), - corpus_dir=os.path.join("Corpora", "meta_Dutch"), - lang="nl")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10fr(), - corpus_dir=os.path.join("Corpora", "meta_French"), - lang="fr")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_ljspeech(), - corpus_dir=os.path.join("Corpora", "LJSpeech"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts(), - corpus_dir=os.path.join("Corpora", "libri"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_vctk(), - corpus_dir=os.path.join("Corpora", "vctk"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_nvidia_hifitts(), - corpus_dir=os.path.join("Corpora", "hifi"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10es(), - corpus_dir=os.path.join("Corpora", "meta_Spanish"), - lang="es")) - - 
datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_eva(), - corpus_dir=os.path.join("Corpora", "Eva"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_hokus(), - corpus_dir=os.path.join("Corpora", "Hokus"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_bernd(), - corpus_dir=os.path.join("Corpora", "Bernd"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_hui_others(), - corpus_dir=os.path.join("Corpora", "hui_others"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_thorsten(), - corpus_dir=os.path.join("Corpora", "Thorsten"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_fluxsing(), - corpus_dir=os.path.join("Corpora", "flux_sing"), - lang="en", - ctc_selection=False)) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_portuguese(), - corpus_dir=os.path.join("Corpora", "mls_porto"), - lang="pt")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_polish(), - corpus_dir=os.path.join("Corpora", "mls_polish"), - lang="pl")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_spanish(), - corpus_dir=os.path.join("Corpora", "mls_spanish"), - lang="es")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_french(), - corpus_dir=os.path.join("Corpora", "mls_french"), - lang="fr")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_italian(), - corpus_dir=os.path.join("Corpora", "mls_italian"), - lang="it")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_dutch(), - corpus_dir=os.path.join("Corpora", "mls_dutch"), - lang="nl")) - - if find_faulty_samples_mode: - find_faulty_samples(net=FastSpeech2(lang_embs=100), - datasets=datasets, - device=torch.device("cuda"), - path_to_checkpoint=resume_checkpoint) - else: - train_loop(net=FastSpeech2(lang_embs=100), - device=torch.device("cuda"), - datasets=datasets, - batch_size=3, - save_directory=meta_save_dir, - steps=100000, - steps_per_checkpoint=1000, - lr=0.001, - path_to_checkpoint=resume_checkpoint, - resume=resume) - - -@torch.no_grad() -def find_faulty_samples(net, - datasets, - device, - path_to_checkpoint): + english_datasets = list() + german_datasets = list() + greek_datasets = list() + spanish_datasets = list() + finnish_datasets = list() + russian_datasets = list() + hungarian_datasets = list() + dutch_datasets = list() + french_datasets = list() + portuguese_datasets = list() + polish_datasets = list() + italian_datasets = list() + + english_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_nancy(), + corpus_dir=os.path.join("Corpora", "Nancy"), + lang="en")) + + english_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_fluxsing(), + corpus_dir=os.path.join("Corpora", "flux_sing"), + lang="en", + ctc_selection=False)) + + english_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_ljspeech(), + corpus_dir=os.path.join("Corpora", "LJSpeech"), + lang="en")) + + 
english_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts(), + corpus_dir=os.path.join("Corpora", "libri"), + lang="en")) + + english_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_vctk(), + corpus_dir=os.path.join("Corpora", "vctk"), + lang="en")) + + english_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_nvidia_hifitts(), + corpus_dir=os.path.join("Corpora", "hifi"), + lang="en")) + + german_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_karlsson(), + corpus_dir=os.path.join("Corpora", "Karlsson"), + lang="de")) + + german_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_eva(), + corpus_dir=os.path.join("Corpora", "Eva"), + lang="de")) + + german_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_hokus(), + corpus_dir=os.path.join("Corpora", "Hokus"), + lang="de")) + + german_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_bernd(), + corpus_dir=os.path.join("Corpora", "Bernd"), + lang="de")) + + german_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_hui_others(), + corpus_dir=os.path.join("Corpora", "hui_others"), + lang="de")) + + german_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_thorsten(), + corpus_dir=os.path.join("Corpora", "Thorsten"), + lang="de")) + + greek_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10el(), + corpus_dir=os.path.join("Corpora", "meta_Greek"), + lang="el")) + + spanish_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_spanish_blizzard_train(), + corpus_dir=os.path.join("Corpora", "spanish_blizzard"), + lang="es")) + + spanish_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10es(), + corpus_dir=os.path.join("Corpora", "meta_Spanish"), + lang="es")) + + spanish_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_spanish(), + corpus_dir=os.path.join("Corpora", "mls_spanish"), + lang="es")) + + finnish_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10fi(), + corpus_dir=os.path.join("Corpora", "meta_Finnish"), + lang="fi")) + + russian_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10ru(), + corpus_dir=os.path.join("Corpora", "meta_Russian"), + lang="ru")) + + hungarian_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10hu(), + corpus_dir=os.path.join("Corpora", "meta_Hungarian"), + lang="hu")) + + dutch_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10nl(), + corpus_dir=os.path.join("Corpora", "meta_Dutch"), + lang="nl")) + + dutch_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_dutch(), + corpus_dir=os.path.join("Corpora", "mls_dutch"), + lang="nl")) + + french_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10fr(), + corpus_dir=os.path.join("Corpora", "meta_French"), + lang="fr")) + + french_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_french(), + corpus_dir=os.path.join("Corpora", "mls_french"), + lang="fr")) + + 
portuguese_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_portuguese(), + corpus_dir=os.path.join("Corpora", "mls_porto"), + lang="pt")) + + polish_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_polish(), + corpus_dir=os.path.join("Corpora", "mls_polish"), + lang="pl")) + + italian_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_italian(), + corpus_dir=os.path.join("Corpora", "mls_italian"), + lang="it")) + + datasets.append(ConcatDataset(english_datasets)) + datasets.append(ConcatDataset(german_datasets)) + datasets.append(ConcatDataset(greek_datasets)) + datasets.append(ConcatDataset(spanish_datasets)) + datasets.append(ConcatDataset(finnish_datasets)) + datasets.append(ConcatDataset(russian_datasets)) + datasets.append(ConcatDataset(hungarian_datasets)) + datasets.append(ConcatDataset(dutch_datasets)) + datasets.append(ConcatDataset(french_datasets)) + datasets.append(ConcatDataset(portuguese_datasets)) + datasets.append(ConcatDataset(polish_datasets)) + datasets.append(ConcatDataset(italian_datasets)) + + if remove_faulty_samples: + find_and_remove_faulty_samples(net=FastSpeech2(lang_embs=100), + datasets=datasets, + device=torch.device("cuda"), + path_to_checkpoint=resume_checkpoint) + + train_loop(net=FastSpeech2(lang_embs=100), + device=torch.device("cuda"), + datasets=datasets, + batch_size=5, + save_directory=meta_save_dir, + steps=100000, + steps_per_checkpoint=1000, + lr=0.001, + path_to_checkpoint=resume_checkpoint, + resume=resume) + + +@torch.inference_mode() +def find_and_remove_faulty_samples(net, + datasets, + device, + path_to_checkpoint): net = net.to(device) torch.multiprocessing.set_sharing_strategy('file_system') check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) net.load_state_dict(check_dict["model"]) - losses = list() - index_pairs = list() for dataset_index in range(len(datasets)): + nan_ids = list() for datapoint_index in tqdm(range(len(datasets[dataset_index]))): loss = net(text_tensors=datasets[dataset_index][datapoint_index][0].unsqueeze(0).to(device), text_lengths=datasets[dataset_index][datapoint_index][1].to(device), @@ -175,13 +201,6 @@ def find_faulty_samples(net, lang_ids=datasets[dataset_index][datapoint_index][8].unsqueeze(0).to(device), return_mels=False).squeeze() if torch.isnan(loss): - print(f"CAREFUL, NAN DETECTED: {dataset_index}, {datapoint_index}") - losses.append(loss.item()) - index_pairs.append((dataset_index, datapoint_index)) - loss_high_to_low = sorted(losses, reverse=True) - print(loss_high_to_low) - threshold = loss_high_to_low[1000] - for index, loss in enumerate(losses): - if loss > threshold: - print(index_pairs[index]) - print(loss) + print(f"NAN DETECTED: {dataset_index}, {datapoint_index}") + nan_ids.append(datapoint_index) + datasets[dataset_index].remove_samples(nan_ids) From c48b66162e94247586513a17538f8d0d07592367 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Sat, 5 Mar 2022 20:39:36 +0100 Subject: [PATCH 14/33] add another dataset to the meta pipeline --- .../FastSpeech2_MetaCheckpoint.py | 11 +++++++++-- Utility/path_to_transcript_dicts.py | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py index 7f5e2b53..6a23cda1 100644 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py +++ 
b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py @@ -28,6 +28,13 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, remove_faulty_sa meta_save_dir = base_dir os.makedirs(meta_save_dir, exist_ok=True) + find_and_remove_faulty_samples(net=FastSpeech2(lang_embs=100), + datasets=[prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_all_clean(), + corpus_dir=os.path.join("Corpora", "libri_all_clean"), + lang="en")], + device=torch.device("cuda"), + path_to_checkpoint=resume_checkpoint) + print("Preparing") english_datasets = list() german_datasets = list() @@ -55,8 +62,8 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, remove_faulty_sa corpus_dir=os.path.join("Corpora", "LJSpeech"), lang="en")) - english_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts(), - corpus_dir=os.path.join("Corpora", "libri"), + english_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_all_clean(), + corpus_dir=os.path.join("Corpora", "libri_all_clean"), lang="en")) english_datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_vctk(), diff --git a/Utility/path_to_transcript_dicts.py b/Utility/path_to_transcript_dicts.py index 29cbbee9..b3cfd4ed 100644 --- a/Utility/path_to_transcript_dicts.py +++ b/Utility/path_to_transcript_dicts.py @@ -189,6 +189,20 @@ def build_path_to_transcript_dict_libritts(): return path_to_transcript +def build_path_to_transcript_dict_libritts_all_clean(): + path_train = "/mount/resources/speech/corpora/LibriTTS/all-clean" + path_to_transcript = dict() + for speaker in os.listdir(path_train): + for chapter in os.listdir(os.path.join(path_train, speaker)): + for file in os.listdir(os.path.join(path_train, speaker, chapter)): + if file.endswith("normalized.txt"): + with open(os.path.join(path_train, speaker, chapter, file), 'r', encoding='utf8') as tf: + transcript = tf.read() + wav_file = file.split(".")[0] + ".wav" + path_to_transcript[os.path.join(path_train, speaker, chapter, wav_file)] = transcript + return path_to_transcript + + def build_path_to_transcript_dict_libritts_other500(): path_train = "/mount/resources/asr-data/LibriTTS/train-other-500" path_to_transcript = dict() From e8bd4c0bfec212f1f52675c874766320b5970dcb Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Sat, 5 Mar 2022 20:40:35 +0100 Subject: [PATCH 15/33] go back to regular training mode --- .../TrainingPipelines/FastSpeech2_MetaCheckpoint.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py index 6a23cda1..48e06849 100644 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py @@ -28,13 +28,6 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, remove_faulty_sa meta_save_dir = base_dir os.makedirs(meta_save_dir, exist_ok=True) - find_and_remove_faulty_samples(net=FastSpeech2(lang_embs=100), - datasets=[prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_all_clean(), - corpus_dir=os.path.join("Corpora", "libri_all_clean"), - lang="en")], - device=torch.device("cuda"), - path_to_checkpoint=resume_checkpoint) - print("Preparing") english_datasets = list() german_datasets = list() From 7f5fa9f1a2f683f0cd781ec692763a13428477e1 Mon Sep 17 00:00:00 2001 
From: Florian Lux Date: Mon, 7 Mar 2022 14:41:22 +0100 Subject: [PATCH 16/33] add crude noisereduction --- InferenceInterfaces/InferenceFastSpeech2.py | 10 +++++++++- Utility/path_to_transcript_dicts.py | 2 +- requirements.txt | 4 +--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/InferenceInterfaces/InferenceFastSpeech2.py b/InferenceInterfaces/InferenceFastSpeech2.py index 77a5a54c..78bd941a 100644 --- a/InferenceInterfaces/InferenceFastSpeech2.py +++ b/InferenceInterfaces/InferenceFastSpeech2.py @@ -3,6 +3,7 @@ import librosa.display as lbd import matplotlib.pyplot as plt +import noisereduce import sounddevice import soundfile import torch @@ -16,7 +17,7 @@ class InferenceFastSpeech2(torch.nn.Module): - def __init__(self, device="cpu", model_name="Meta", language="en"): + def __init__(self, device="cpu", model_name="Meta", language="en", noise_reduce=True): super().__init__() self.device = device self.text2phone = ArticulatoryCombinedTextFrontend(language=language, add_silence_to_end=True) @@ -34,6 +35,11 @@ def __init__(self, device="cpu", model_name="Meta", language="en"): self.mel2wav.eval() self.lang_id = get_language_id(language) self.to(torch.device(device)) + self.noise_reduce = noise_reduce + if self.noise_reduce: + self.noise_reduce = False + self.prototypical_noise = self("~." * 100, input_is_phones=True).cpu().numpy() + self.noise_reduce = True def set_utterance_embedding(self, path_to_reference_audio): wave, sr = soundfile.read(path_to_reference_audio) @@ -78,6 +84,8 @@ def forward(self, text, view=False, durations=None, pitch=None, energy=None, inp ax[0].set_title(text) plt.subplots_adjust(left=0.05, bottom=0.1, right=0.95, top=.9, wspace=0.0, hspace=0.0) plt.show() + if self.noise_reduce: + wave = torch.tensor(noisereduce.reduce_noise(y=wave, y_noise=self.prototypical_noise, sr=48000, stationary=True)) return wave def read_to_file(self, text_list, file_location, silent=False, dur_list=None, pitch_list=None, energy_list=None): diff --git a/Utility/path_to_transcript_dicts.py b/Utility/path_to_transcript_dicts.py index b3cfd4ed..6a70aa60 100644 --- a/Utility/path_to_transcript_dicts.py +++ b/Utility/path_to_transcript_dicts.py @@ -190,7 +190,7 @@ def build_path_to_transcript_dict_libritts(): def build_path_to_transcript_dict_libritts_all_clean(): - path_train = "/mount/resources/speech/corpora/LibriTTS/all-clean" + path_train = "/mount/resources/speech/corpora/LibriTTS/all_clean" path_to_transcript = dict() for speaker in os.listdir(path_train): for chapter in os.listdir(os.path.join(path_train, speaker)): diff --git a/requirements.txt b/requirements.txt index 23ab8e73..e7361eac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -80,6 +80,4 @@ uritemplate~=3.0.1 urllib3~=1.26.6 wcwidth~=0.2.5 wincertstore~=0.2 -gdown -transformers -datasets \ No newline at end of file +noisereduce \ No newline at end of file From b676d4bdfc91a051d46672290b6bc4c53a339198 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Mon, 7 Mar 2022 18:02:57 +0100 Subject: [PATCH 17/33] fix devices --- InferenceInterfaces/InferenceFastSpeech2.py | 12 ++++++++---- .../TrainingPipelines/FastSpeech2_LibriTTS_600.py | 2 -- .../FastSpeech2_LibriTTS_asr_phn_600.py | 2 -- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/InferenceInterfaces/InferenceFastSpeech2.py b/InferenceInterfaces/InferenceFastSpeech2.py index 78bd941a..e0501cae 100644 --- a/InferenceInterfaces/InferenceFastSpeech2.py +++ b/InferenceInterfaces/InferenceFastSpeech2.py @@ -85,14 +85,18 @@ def forward(self, 
text, view=False, durations=None, pitch=None, energy=None, inp plt.subplots_adjust(left=0.05, bottom=0.1, right=0.95, top=.9, wspace=0.0, hspace=0.0) plt.show() if self.noise_reduce: - wave = torch.tensor(noisereduce.reduce_noise(y=wave, y_noise=self.prototypical_noise, sr=48000, stationary=True)) + wave = torch.tensor(noisereduce.reduce_noise(y=wave, y_noise=self.prototypical_noise, sr=48000, stationary=True), device=self.device) return wave def read_to_file(self, text_list, file_location, silent=False, dur_list=None, pitch_list=None, energy_list=None): """ - :param silent: Whether to be verbose about the process - :param text_list: A list of strings to be read - :param file_location: The path and name of the file it should be saved to + Args: + silent: Whether to be verbose about the process + text_list: A list of strings to be read + file_location: The path and name of the file it should be saved to + energy_list: list of energy tensors to be used for the texts + pitch_list: list of pitch tensors to be used for the texts + dur_list: list of duration tensors to be used for the texts """ if not dur_list: dur_list = [] diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_600.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_600.py index a370528d..a91eb5d2 100644 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_600.py +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_600.py @@ -40,8 +40,6 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): model = FastSpeech2(lang_embs=None) - find_faulty_samples(model, train_set, device, "Models/FastSpeech2_LibriTTS/best.pt") - train_sets = list() train_sets.append(train_set) train_sets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts(), diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn_600.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn_600.py index d2863801..c29217f0 100644 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn_600.py +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn_600.py @@ -40,8 +40,6 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): model = FastSpeech2(lang_embs=None) - find_faulty_samples(model, train_set, device, "Models/FastSpeech2_LibriTTS_asr_phn/best.pt") - train_sets = list() train_sets.append(train_set) train_sets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_asr_phn(), From ef86ae5b6e8da204919d80eb825bc5bd5fd49e4c Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Mon, 7 Mar 2022 19:17:27 +0100 Subject: [PATCH 18/33] fix meta train loop --- .../Text_to_Spectrogram/FastSpeech2/meta_train_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py b/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py index a79d03d0..a05edaf9 100644 --- a/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py +++ b/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py @@ -86,7 +86,7 @@ def train_loop(net, # ============================= for step in tqdm(range(step_counter, steps)): batches = [] - for index in random.sample(list(range(len(datasets)))): + for index in random.sample(list(range(len(datasets))), len(datasets)): # we get one batch for each task (i.e. 
language in this case)
             try:
                 batch = next(train_iters[index])

From 63c43b49ef9727467426535477974b486bef8f96 Mon Sep 17 00:00:00 2001
From: Florian Lux
Date: Mon, 7 Mar 2022 23:58:03 +0100
Subject: [PATCH 19/33] implement mel cepstral distortion with warping

---
 Utility/EvaluationScripts/audio_vs_audio.py | 92 ++++++++++++++++++++-
 run_evaluation.py                           |  3 +
 2 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 run_evaluation.py

diff --git a/Utility/EvaluationScripts/audio_vs_audio.py b/Utility/EvaluationScripts/audio_vs_audio.py
index cfd1ce29..27e66782 100644
--- a/Utility/EvaluationScripts/audio_vs_audio.py
+++ b/Utility/EvaluationScripts/audio_vs_audio.py
@@ -1,6 +1,96 @@
 """
-mel cepstral distortion (with and without DTW to see if the alignment is good (low difference means good alignment)) https://github.com/MattShannon/mcd
 gross pitch error
+Where p_t, p'_t are the pitch signals from the reference
+and predicted audio, v_t, v'_t are the voicing decisions
+from the reference and predicted audio, and 1 is the
+indicator function. The GPE measures the percentage
+of voiced frames that deviate in pitch by more than
+20% compared to the reference.
+
+
+
+
 voicing decision error
+Where v_t, v'_t are the voicing decisions for the reference
+and predicted audio, T is the total number of frames,
+and 1 is the indicator function.
+
+
+
+
 f0 frame error
+FFE measures the percentage of frames that either contain a 20% pitch error (according to GPE) or a voicing decision error (according to VDE).
+
 """
+
+import librosa
+import numpy
+import soundfile as sf
+from numpy import inf
+from numpy import ndim
+from numpy import zeros
+from scipy.spatial.distance import cdist
+from sklearn.metrics import mean_squared_error
+
+
+def mcd_with_warping(path_1, path_2):
+    """
+    calculate mel cepstral distortion between two unaligned sequences by first performing alignment with warping and then calculating the MSE between them.
+ + DTW takes an insane amount of RAM if you're not careful with sequence lengths + """ + wave_1, sr_1 = sf.read(path_1) + wave_2, sr_2 = sf.read(path_2) + spec_1 = logmelfilterbank(audio=wave_1, sampling_rate=sr_1) + spec_2 = logmelfilterbank(audio=wave_2, sampling_rate=sr_2) + dist, _, _ = dtw(spec_1, spec_2, mean_squared_error) + return dist + + +def dtw(x, y, dist, warp=1): + """ + https://github.com/pierre-rouanet/dtw/blob/master/dtw/dtw.py + """ + assert len(x) + assert len(y) + if ndim(x) == 1: + x = x.reshape(-1, 1) + if ndim(y) == 1: + y = y.reshape(-1, 1) + r, c = len(x), len(y) + D0 = zeros((r + 1, c + 1)) + D0[0, 1:] = inf + D0[1:, 0] = inf + D1 = D0[1:, 1:] + print("calculating alignment...") + D0[1:, 1:] = cdist(x, y, dist) + C = D1.copy() + for i in range(r): + for j in range(c): + min_list = [D0[i, j]] + for k in range(1, warp + 1): + min_list += [D0[min(i + k, r), j], + D0[i, min(j + k, c)]] + D1[i, j] += min(min_list) + return D1[-1, -1], C, D1 + + +def logmelfilterbank(audio, sampling_rate, fmin=40, fmax=8000, eps=1e-10): + # get amplitude spectrogram + x_stft = librosa.stft(audio, n_fft=1024, hop_length=256, win_length=None, window="hann", pad_mode="reflect") + spc = numpy.abs(x_stft).T + # get mel basis + fmin = 0 if fmin is None else fmin + fmax = sampling_rate / 2 if fmax is None else fmax + mel_basis = librosa.filters.mel(sampling_rate, 1024, 80, fmin, fmax) + # apply log and return + return numpy.log10(numpy.maximum(eps, numpy.dot(spc, mel_basis.T))) diff --git a/run_evaluation.py b/run_evaluation.py new file mode 100644 index 00000000..27314746 --- /dev/null +++ b/run_evaluation.py @@ -0,0 +1,3 @@ +from Utility.EvaluationScripts.audio_vs_audio import mcd_with_warping + +print(mcd_with_warping("audios/test.wav", "audios/test_cloned.wav")) From 3c89a7de7e07677009a0a7f645ab3a06252cd3fe Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Tue, 8 Mar 2022 20:33:16 +0100 Subject: [PATCH 20/33] implement ffe, gpe, vde --- InferenceInterfaces/InferenceFastSpeech2.py | 12 +- Utility/EvaluationScripts/audio_vs_audio.py | 136 +++++++++++++++----- run_evaluation.py | 64 ++++++++- run_utterance_cloner.py | 2 +- 4 files changed, 175 insertions(+), 39 deletions(-) diff --git a/InferenceInterfaces/InferenceFastSpeech2.py b/InferenceInterfaces/InferenceFastSpeech2.py index e0501cae..ce5af9e5 100644 --- a/InferenceInterfaces/InferenceFastSpeech2.py +++ b/InferenceInterfaces/InferenceFastSpeech2.py @@ -37,13 +37,19 @@ def __init__(self, device="cpu", model_name="Meta", language="en", noise_reduce= self.to(torch.device(device)) self.noise_reduce = noise_reduce if self.noise_reduce: - self.noise_reduce = False - self.prototypical_noise = self("~." * 100, input_is_phones=True).cpu().numpy() - self.noise_reduce = True + self.prototypical_noise = None + self.update_noise_profile() def set_utterance_embedding(self, path_to_reference_audio): wave, sr = soundfile.read(path_to_reference_audio) self.default_utterance_embedding = ProsodicConditionExtractor(sr=sr).extract_condition_from_reference_wave(wave).to(self.device) + if self.noise_reduce: + self.update_noise_profile() + + def update_noise_profile(self): + self.noise_reduce = False + self.prototypical_noise = self("~." 
* 100, input_is_phones=True).cpu().numpy() + self.noise_reduce = True def set_language(self, lang_id): """ diff --git a/Utility/EvaluationScripts/audio_vs_audio.py b/Utility/EvaluationScripts/audio_vs_audio.py index 27e66782..4ef108b8 100644 --- a/Utility/EvaluationScripts/audio_vs_audio.py +++ b/Utility/EvaluationScripts/audio_vs_audio.py @@ -1,51 +1,80 @@ -""" -gross pitch error -Where pt,p -0 -t -are the pitch signals from the reference -and predicted audio, vt,v -0 -t -are the voicing decisions -from the reference and predicted audio, and 1 is the -indicator function. The GPE measures the percentage -of voiced frames that deviate in pitch by more than -20% compared to the reference. +import librosa +import matplotlib.pyplot as plt +import numpy +import soundfile as sf +from numpy import inf +from numpy import ndim +from numpy import zeros +from scipy.spatial.distance import cdist +from sklearn.metrics import mean_squared_error +from Preprocessing.AudioPreprocessor import AudioPreprocessor +from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.PitchCalculator import Dio +def vde(path_1, path_2): + """ + Voicing Decision Error measures the inverted accuracy of frames that are voiced compared to the reference. -voicing decision error -Where vt,v -0 -t -are the voicing decisions for the reference -and predicted audio, T is the total number of frames, -and 1 is the indicator function. + The first path should lead to the 'gold' audio + """ + pitchcurve_1, pitchcurve_2 = get_pitch_curves(path_1, path_2) + correct_frames, incorrect_frames = list(), list() + for index in range(len(pitchcurve_1)): + if (pitchcurve_1[index] == 0.0 and pitchcurve_2[index] != 0.0) or (pitchcurve_1[index] != 0.0 and pitchcurve_2[index] == 0.0): + incorrect_frames.append(index) + else: + correct_frames.append(index) + return len(incorrect_frames) / (len(correct_frames) + len(incorrect_frames)) -f0 frame error -FFE measures the percentage of frames that either contain a 20% pitch error (according to GPE) or a voicing decision error (according to VDE). +def gpe(path_1, path_2): + """ + Gross Pitch Error measures the percentage of voiced frames that deviate in pitch by more than 20% compared to the reference. -""" + The first path should lead to the 'gold' audio + """ + pitchcurve_1, pitchcurve_2 = get_pitch_curves(path_1, path_2) -import librosa -import numpy -import soundfile as sf -from numpy import inf -from numpy import ndim -from numpy import zeros -from scipy.spatial.distance import cdist -from sklearn.metrics import mean_squared_error + correct_frames, incorrect_frames = list(), list() + for index in range(len(pitchcurve_1)): + twenty_percent_deviation = pitchcurve_1[index] * 0.2 # 20% deviation is acceptable + if pitchcurve_1[index] + twenty_percent_deviation > pitchcurve_2[index] > pitchcurve_1[index] - twenty_percent_deviation: + correct_frames.append(index) + else: + incorrect_frames.append(index) + + return len(incorrect_frames) / (len(correct_frames) + len(incorrect_frames)) + + +def ffe(path_1, path_2): + """ + F0 Frame Error measures the percentage of frames that either contain a 20% pitch error (according to GPE) or a voicing decision error (according to VDE). 
+ + The first path should lead to the 'gold' audio + """ + pitchcurve_1, pitchcurve_2 = get_pitch_curves(path_1, path_2) + + correct_frames, incorrect_frames = set(), set() + for index in range(len(pitchcurve_1)): + twenty_percent_deviation = pitchcurve_1[index] * 0.2 # 20% deviation is acceptable + if (pitchcurve_1[index] + twenty_percent_deviation > pitchcurve_2[index] > pitchcurve_1[index] - twenty_percent_deviation) and not ( + (pitchcurve_1[index] == 0.0 and pitchcurve_2[index] != 0.0) or (pitchcurve_1[index] != 0.0 and pitchcurve_2[index] == 0.0)): + correct_frames.add(index) + else: + incorrect_frames.add(index) + + return len(incorrect_frames) / (len(correct_frames) + len(incorrect_frames)) def mcd_with_warping(path_1, path_2): """ calculate mel cepstral distortion between two unaligned sequences by first performing alignment with warping and then calculating the MSE between them. + The two audios have to be spoken by the same speaker for it to make sense. + DTW takes an insane amount of RAM if you're not careful with sequence lengths """ wave_1, sr_1 = sf.read(path_1) @@ -53,12 +82,12 @@ def mcd_with_warping(path_1, path_2): spec_1 = logmelfilterbank(audio=wave_1, sampling_rate=sr_1) spec_2 = logmelfilterbank(audio=wave_2, sampling_rate=sr_2) dist, _, _ = dtw(spec_1, spec_2, mean_squared_error) - return dist + return dist / len(spec_1) def dtw(x, y, dist, warp=1): """ - https://github.com/pierre-rouanet/dtw/blob/master/dtw/dtw.py + https://github.com/pierre-rouanet/dtw/blob/master/dtw/dtw.py """ assert len(x) assert len(y) @@ -94,3 +123,42 @@ def logmelfilterbank(audio, sampling_rate, fmin=40, fmax=8000, eps=1e-10): mel_basis = librosa.filters.mel(sampling_rate, 1024, 80, fmin, fmax) # apply log and return return numpy.log10(numpy.maximum(eps, numpy.dot(spc, mel_basis.T))) + + +def get_pitch_curves(path_1, path_2, plot_curves=False, length_norm=True): + wave_1, sr_1 = sf.read(path_1) + wave_2, sr_2 = sf.read(path_2) + + ap_1 = AudioPreprocessor(cut_silence=True, input_sr=sr_1, output_sr=16000) + ap_2 = AudioPreprocessor(cut_silence=True, input_sr=sr_2, output_sr=16000) + + norm_wave_1 = ap_1.audio_to_wave_tensor(wave_1, normalize=True) + norm_wave_2 = ap_2.audio_to_wave_tensor(wave_2, normalize=True) + + dio = Dio(fs=16000, use_token_averaged_f0=False, use_log_f0=False, use_continuous_f0=False) + + pitch_curve_1 = dio(norm_wave_1.unsqueeze(0), norm_by_average=False)[0].squeeze() + pitch_curve_2 = dio(norm_wave_2.unsqueeze(0), norm_by_average=False)[0].squeeze() + + if length_norm: + # symmetrically remove samples from front and back so we end up with the same amount of frames in both + toggle = True + while len(pitch_curve_1) > len(pitch_curve_2): + if toggle: + pitch_curve_1 = pitch_curve_1[1:] + else: + pitch_curve_1 = pitch_curve_1[:-1] + toggle = not toggle + while len(pitch_curve_1) < len(pitch_curve_2): + if toggle: + pitch_curve_2 = pitch_curve_2[1:] + else: + pitch_curve_2 = pitch_curve_2[:-1] + toggle = not toggle + + if plot_curves: + plt.plot(pitch_curve_1, c="red") + plt.plot(pitch_curve_2, c="blue") + plt.show() + + return pitch_curve_1, pitch_curve_2 diff --git a/run_evaluation.py b/run_evaluation.py index 27314746..104c0960 100644 --- a/run_evaluation.py +++ b/run_evaluation.py @@ -1,3 +1,65 @@ +import os + +from tqdm import tqdm + +from Utility.EvaluationScripts.audio_vs_audio import ffe +from Utility.EvaluationScripts.audio_vs_audio import gpe from Utility.EvaluationScripts.audio_vs_audio import mcd_with_warping +from Utility.EvaluationScripts.audio_vs_audio 
import vde + +ffe(f"audios/adept/human/1.wav", f"audios/adept/same_voice_same_style/1.wav") +gpe(f"audios/adept/human/1.wav", f"audios/adept/same_voice_same_style/1.wav") +vde(f"audios/adept/human/1.wav", f"audios/adept/same_voice_same_style/1.wav") + +mcd_same_style = list() +mcd_diff_style = list() + +ffe_same_style = list() +ffe_diff_style = list() + +gpe_same_style = list() +gpe_diff_style = list() + +vde_same_style = list() +vde_diff_style = list() + +for file in tqdm(os.listdir("audios/adept/human")): + if file.endswith(".wav"): + mcd_same_style.append(mcd_with_warping(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}")) + # vde_same_style.append(vde(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}")) + # gpe_same_style.append(gpe(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}")) + # ffe_same_style.append(ffe(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}")) + + mcd_diff_style.append(mcd_with_warping(f"audios/adept/human/{file}", f"audios/adept/same_voice_diff_style/{file}")) + # vde_diff_style.append(vde(f"audios/adept/human/{file}", f"audios/adept/same_voice_diff_style/{file}")) + # gpe_diff_style.append(gpe(f"audios/adept/human/{file}", f"audios/adept/same_voice_diff_style/{file}")) + # ffe_diff_style.append(ffe(f"audios/adept/human/{file}", f"audios/adept/same_voice_diff_style/{file}")) + +print(mcd_same_style) +# print(vde_same_style) +# print(gpe_same_style) +# print(ffe_same_style) + +print(mcd_diff_style) +# print(vde_diff_style) +# print(gpe_diff_style) +# print(ffe_diff_style) + +print((1 / len(mcd_same_style)) * sum(mcd_same_style)) +# print((1 / len(vde_same_style)) * sum(vde_same_style)) +# print((1 / len(gpe_same_style)) * sum(gpe_same_style)) +# print((1 / len(ffe_same_style)) * sum(ffe_same_style)) + +print((1 / len(mcd_diff_style)) * sum(mcd_diff_style)) +# print((1 / len(vde_diff_style)) * sum(vde_diff_style)) +# print((1 / len(gpe_diff_style)) * sum(gpe_diff_style)) +# print((1 / len(ffe_diff_style)) * sum(ffe_diff_style)) -print(mcd_with_warping("audios/test.wav", "audios/test_cloned.wav")) +""" +0.3197801087172897 +0.6701621670474431 +0.6701621670474431 +0.4236358253409133 +0.7842741112107358 +0.7842741112107358 +""" diff --git a/run_utterance_cloner.py b/run_utterance_cloner.py index 6cb17081..2e31fcbd 100644 --- a/run_utterance_cloner.py +++ b/run_utterance_cloner.py @@ -130,7 +130,7 @@ def clone_utterance(self, uc.clone_utterance(path_to_reference_audio="audios/test.wav", reference_transcription="Hello world, this is a test.", filename_of_result="audios/test_cloned.wav", - clone_speaker_identity=False, + clone_speaker_identity=True, lang="en") uc.clone_utterance(path_to_reference_audio="audios/test.wav", From e614a4221051694a77502eca1239747ac7477e1a Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Tue, 8 Mar 2022 22:40:16 +0100 Subject: [PATCH 21/33] add english pipeline --- .../TrainingPipelines/FastSpeech2_English.py | 73 +++++++++++++++++++ run_evaluation.py | 36 ++++----- run_training_pipeline.py | 4 +- 3 files changed, 94 insertions(+), 19 deletions(-) create mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_English.py diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_English.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_English.py new file mode 100644 index 00000000..079371dc --- /dev/null +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_English.py @@ -0,0 +1,73 @@ +import random + +import torch +from 
torch.utils.data import ConcatDataset
+
+from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2
+from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop
+from Utility.corpus_preparation import prepare_fastspeech_corpus
+from Utility.path_to_transcript_dicts import *
+
+
+def run(gpu_id, resume_checkpoint, finetune, model_dir, resume):
+    if gpu_id == "cpu":
+        os.environ["CUDA_VISIBLE_DEVICES"] = ""
+        device = torch.device("cpu")
+
+    else:
+        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+        os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
+        device = torch.device("cuda")
+
+    torch.manual_seed(131714)
+    random.seed(131714)
+    torch.random.manual_seed(131714)
+
+    print("Preparing")
+
+    if model_dir is not None:
+        save_dir = model_dir
+    else:
+        save_dir = os.path.join("Models", "FastSpeech2_English")
+    os.makedirs(save_dir, exist_ok=True)
+
+    datasets = list()
+    datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_nancy(),
+                                              corpus_dir=os.path.join("Corpora", "Nancy"),
+                                              lang="en"))
+
+    datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_ljspeech(),
+                                              corpus_dir=os.path.join("Corpora", "LJSpeech"),
+                                              lang="en"))
+
+    datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_all_clean(),
+                                              corpus_dir=os.path.join("Corpora", "libri_all_clean"),
+                                              lang="en"))
+
+    datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_vctk(),
+                                              corpus_dir=os.path.join("Corpora", "vctk"),
+                                              lang="en"))
+
+    datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_nvidia_hifitts(),
+                                              corpus_dir=os.path.join("Corpora", "hifi"),
+                                              lang="en"))
+
+    train_set = ConcatDataset(datasets)
+
+    model = FastSpeech2(lang_embs=100)
+    # because we want to finetune it, we treat it as multilingual, even though we are only interested in English here
+
+    print("Training model")
+    train_loop(net=model,
+               train_dataset=train_set,
+               device=device,
+               save_directory=save_dir,
+               steps=500000,
+               batch_size=10,
+               lang="en",
+               lr=0.001,
+               epochs_per_save=1,
+               warmup_steps=4000,
+               path_to_checkpoint="Models/FastSpeech2_Meta/best.pt",
+               fine_tune=True,
+               resume=resume)
diff --git a/run_evaluation.py b/run_evaluation.py
index 104c0960..ab417080 100644
--- a/run_evaluation.py
+++ b/run_evaluation.py
@@ -26,34 +26,34 @@ for file in tqdm(os.listdir("audios/adept/human")):
     if file.endswith(".wav"):
         mcd_same_style.append(mcd_with_warping(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}"))
-        # vde_same_style.append(vde(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}"))
-        # gpe_same_style.append(gpe(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}"))
-        # ffe_same_style.append(ffe(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}"))
+        vde_same_style.append(vde(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}"))
+        gpe_same_style.append(gpe(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}"))
+        ffe_same_style.append(ffe(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}"))

         mcd_diff_style.append(mcd_with_warping(f"audios/adept/human/{file}", f"audios/adept/same_voice_diff_style/{file}"))
-        # vde_diff_style.append(vde(f"audios/adept/human/{file}", f"audios/adept/same_voice_diff_style/{file}"))
-        # gpe_diff_style.append(gpe(f"audios/adept/human/{file}",
f"audios/adept/same_voice_diff_style/{file}")) - # ffe_diff_style.append(ffe(f"audios/adept/human/{file}", f"audios/adept/same_voice_diff_style/{file}")) + vde_diff_style.append(vde(f"audios/adept/human/{file}", f"audios/adept/same_voice_diff_style/{file}")) + gpe_diff_style.append(gpe(f"audios/adept/human/{file}", f"audios/adept/same_voice_diff_style/{file}")) + ffe_diff_style.append(ffe(f"audios/adept/human/{file}", f"audios/adept/same_voice_diff_style/{file}")) print(mcd_same_style) -# print(vde_same_style) -# print(gpe_same_style) -# print(ffe_same_style) +print(vde_same_style) +print(gpe_same_style) +print(ffe_same_style) print(mcd_diff_style) -# print(vde_diff_style) -# print(gpe_diff_style) -# print(ffe_diff_style) +print(vde_diff_style) +print(gpe_diff_style) +print(ffe_diff_style) print((1 / len(mcd_same_style)) * sum(mcd_same_style)) -# print((1 / len(vde_same_style)) * sum(vde_same_style)) -# print((1 / len(gpe_same_style)) * sum(gpe_same_style)) -# print((1 / len(ffe_same_style)) * sum(ffe_same_style)) +print((1 / len(vde_same_style)) * sum(vde_same_style)) +print((1 / len(gpe_same_style)) * sum(gpe_same_style)) +print((1 / len(ffe_same_style)) * sum(ffe_same_style)) print((1 / len(mcd_diff_style)) * sum(mcd_diff_style)) -# print((1 / len(vde_diff_style)) * sum(vde_diff_style)) -# print((1 / len(gpe_diff_style)) * sum(gpe_diff_style)) -# print((1 / len(ffe_diff_style)) * sum(ffe_diff_style)) +print((1 / len(vde_diff_style)) * sum(vde_diff_style)) +print((1 / len(gpe_diff_style)) * sum(gpe_diff_style)) +print((1 / len(ffe_diff_style)) * sum(ffe_diff_style)) """ 0.3197801087172897 diff --git a/run_training_pipeline.py b/run_training_pipeline.py index 70bc717f..a84409cc 100644 --- a/run_training_pipeline.py +++ b/run_training_pipeline.py @@ -1,6 +1,7 @@ import argparse import sys +from TrainingInterfaces.TrainingPipelines.FastSpeech2_English import run as english from TrainingInterfaces.TrainingPipelines.FastSpeech2_German import run as full_ger from TrainingInterfaces.TrainingPipelines.FastSpeech2_GermanSingle import run as single_ger from TrainingInterfaces.TrainingPipelines.FastSpeech2_Karlsson import run as karlsson @@ -38,7 +39,8 @@ "asr_out" : asr_out, "asr_phn" : asr_phn, "phn600" : phn600, - "libri600" : libri600 + "libri600" : libri600, + "english" : english } if __name__ == '__main__': From 62a6c23bd5428a9244ad1959d20077bf84ec4e28 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Thu, 10 Mar 2022 14:30:46 +0100 Subject: [PATCH 22/33] finish automatic evaluation scripts --- Preprocessing/AudioPreprocessor.py | 7 ++- Utility/EvaluationScripts/audio_vs_audio.py | 60 ++++++++++++++++++++- run_evaluation.py | 10 ++-- run_utterance_cloner.py | 3 +- 4 files changed, 72 insertions(+), 8 deletions(-) diff --git a/Preprocessing/AudioPreprocessor.py b/Preprocessing/AudioPreprocessor.py index 5adaae5d..da298605 100644 --- a/Preprocessing/AudioPreprocessor.py +++ b/Preprocessing/AudioPreprocessor.py @@ -11,7 +11,7 @@ class AudioPreprocessor: - def __init__(self, input_sr, output_sr=None, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False, device="cpu"): + def __init__(self, input_sr, output_sr=None, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False, device="cpu", fmax_for_spec=8000): """ The parameters are by default set up to do well on a 16kHz signal. 
A different sampling rate may @@ -28,6 +28,7 @@ def __init__(self, input_sr, output_sr=None, melspec_buckets=80, hop_length=256, self.mel_buckets = melspec_buckets self.meter = pyln.Meter(input_sr) self.final_sr = input_sr + self.fmax_for_spec = fmax_for_spec if cut_silence: torch.hub._validate_not_a_forked_repo = lambda a, b, c: True # torch 1.9 has a bug in the hub loading, this is a workaround # careful: assumes 16kHz or 8kHz audio @@ -87,7 +88,7 @@ def normalize_loudness(self, audio): peak_normed = numpy.divide(loud_normed, peak) return peak_normed - def logmelfilterbank(self, audio, sampling_rate, fmin=40, fmax=8000, eps=1e-10): + def logmelfilterbank(self, audio, sampling_rate, fmin=40, fmax=None, eps=1e-10): """ Compute log-Mel filterbank @@ -96,6 +97,8 @@ def logmelfilterbank(self, audio, sampling_rate, fmin=40, fmax=8000, eps=1e-10): compatibility, this is kept for now. If there is ever a reason to completely re-train all models, this would be a good opportunity to make the switch. """ + if fmax is None: + fmax = self.fmax_for_spec if isinstance(audio, torch.Tensor): audio = audio.numpy() # get amplitude spectrogram diff --git a/Utility/EvaluationScripts/audio_vs_audio.py b/Utility/EvaluationScripts/audio_vs_audio.py index 4ef108b8..c9c60a90 100644 --- a/Utility/EvaluationScripts/audio_vs_audio.py +++ b/Utility/EvaluationScripts/audio_vs_audio.py @@ -1,4 +1,5 @@ import librosa +import librosa.display as lbd import matplotlib.pyplot as plt import numpy import soundfile as sf @@ -141,7 +142,7 @@ def get_pitch_curves(path_1, path_2, plot_curves=False, length_norm=True): pitch_curve_2 = dio(norm_wave_2.unsqueeze(0), norm_by_average=False)[0].squeeze() if length_norm: - # symmetrically remove samples from front and back so we end up with the same amount of frames in both + # symmetrically remove samples from front and back, so we end up with the same amount of frames in both toggle = True while len(pitch_curve_1) > len(pitch_curve_2): if toggle: @@ -162,3 +163,60 @@ def get_pitch_curves(path_1, path_2, plot_curves=False, length_norm=True): plt.show() return pitch_curve_1, pitch_curve_2 + + +def get_pitch_curves_abc(path_1, path_2, path_3): + wave_1, sr_1 = sf.read(path_1) + wave_2, sr_2 = sf.read(path_2) + wave_3, sr_3 = sf.read(path_3) + + ap_1 = AudioPreprocessor(cut_silence=True, input_sr=sr_1, output_sr=16000, fmax_for_spec=1000) + ap_2 = AudioPreprocessor(cut_silence=True, input_sr=sr_2, output_sr=16000, fmax_for_spec=1000) + ap_3 = AudioPreprocessor(cut_silence=True, input_sr=sr_3, output_sr=16000, fmax_for_spec=1000) + + norm_wave_1 = ap_1.audio_to_wave_tensor(wave_1, normalize=True) + norm_wave_2 = ap_2.audio_to_wave_tensor(wave_2, normalize=True) + norm_wave_3 = ap_3.audio_to_wave_tensor(wave_3, normalize=True) + + dio = Dio(fs=16000, use_token_averaged_f0=False, use_log_f0=False, use_continuous_f0=False, n_fft=1024, hop_length=256) + + pitch_curve_1 = dio(norm_wave_1.unsqueeze(0), norm_by_average=False)[0].squeeze() + pitch_curve_2 = dio(norm_wave_2.unsqueeze(0), norm_by_average=False)[0].squeeze() + pitch_curve_3 = dio(norm_wave_3.unsqueeze(0), norm_by_average=False)[0].squeeze() + + fig, ax = plt.subplots(nrows=3, ncols=1) + lbd.specshow(ap_1.audio_to_mel_spec_tensor(wave_1).numpy(), + ax=ax[0], + sr=16000, + cmap='GnBu', + y_axis='mel', + x_axis=None, + hop_length=256) + ax[0].yaxis.set_visible(False) + ax[0].set_title("Human Speech") + ax[0].plot(pitch_curve_1, c="darkred") + + lbd.specshow(ap_2.audio_to_mel_spec_tensor(wave_2).numpy(), + ax=ax[1], + sr=16000, + 
cmap='GnBu', + y_axis='mel', + x_axis=None, + hop_length=256) + ax[1].yaxis.set_visible(False) + ax[1].set_title("Synthetic Speech 2") + ax[1].plot(pitch_curve_2, c="darkred") + + lbd.specshow(ap_3.audio_to_mel_spec_tensor(wave_3).numpy(), + ax=ax[2], + sr=16000, + cmap='GnBu', + y_axis='mel', + x_axis=None, + hop_length=256) + ax[2].yaxis.set_visible(False) + ax[2].set_title("Synthetic Speech 1") + ax[2].plot(pitch_curve_3, c="darkred") + + plt.tight_layout() + plt.show() diff --git a/run_evaluation.py b/run_evaluation.py index ab417080..41f34ac7 100644 --- a/run_evaluation.py +++ b/run_evaluation.py @@ -3,13 +3,12 @@ from tqdm import tqdm from Utility.EvaluationScripts.audio_vs_audio import ffe +# from Utility.EvaluationScripts.playground_audio_vs_audio_poem import get_pitch_curves_abc from Utility.EvaluationScripts.audio_vs_audio import gpe from Utility.EvaluationScripts.audio_vs_audio import mcd_with_warping from Utility.EvaluationScripts.audio_vs_audio import vde -ffe(f"audios/adept/human/1.wav", f"audios/adept/same_voice_same_style/1.wav") -gpe(f"audios/adept/human/1.wav", f"audios/adept/same_voice_same_style/1.wav") -vde(f"audios/adept/human/1.wav", f"audios/adept/same_voice_same_style/1.wav") +# get_pitch_curves_abc(f"audios/ps2/PoetryStudy/Set5/s5_p1_ref2.wav", f"audios/ps2/PoetryStudy/Set5/s5_p1_ref1.wav", f"audios/ps2/PoetryStudy/Set5/s5_p1_base2_pros_1.wav") mcd_same_style = list() mcd_diff_style = list() @@ -56,9 +55,14 @@ print((1 / len(ffe_diff_style)) * sum(ffe_diff_style)) """ +Results on ADEPT + +25.628487893016743 0.3197801087172897 0.6701621670474431 0.6701621670474431 + +5.585193124795971 0.4236358253409133 0.7842741112107358 0.7842741112107358 diff --git a/run_utterance_cloner.py b/run_utterance_cloner.py index 2e31fcbd..9559b66e 100644 --- a/run_utterance_cloner.py +++ b/run_utterance_cloner.py @@ -31,7 +31,6 @@ def __init__(self, model_id, device): (self.get_speech_timestamps, _, _, _, _) = utils torch.set_grad_enabled(True) # finding this issue was very infuriating: silero sets # this to false globally during model loading rather than using inference mode or no_grad - self.silero_model = self.silero_model.to(self.device) def extract_prosody(self, transcript, ref_audio_path, lang="de", on_line_fine_tune=True): acoustic_model = Aligner() @@ -50,7 +49,7 @@ def extract_prosody(self, transcript, ref_audio_path, lang="de", on_line_fine_tu raise RuntimeError with torch.inference_mode(): - speech_timestamps = self.get_speech_timestamps(norm_wave.to(self.device), self.silero_model, sampling_rate=16000) + speech_timestamps = self.get_speech_timestamps(norm_wave, self.silero_model, sampling_rate=16000) norm_wave = norm_wave[speech_timestamps[0]['start']:speech_timestamps[-1]['end']] norm_wave_length = torch.LongTensor([len(norm_wave)]) From 678cd1a81c7150f028a2464925cd4b4a5d5692d9 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Thu, 10 Mar 2022 14:45:28 +0100 Subject: [PATCH 23/33] fix noisereduce when tensor is on cuda --- InferenceInterfaces/InferenceFastSpeech2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InferenceInterfaces/InferenceFastSpeech2.py b/InferenceInterfaces/InferenceFastSpeech2.py index ce5af9e5..1b4e00c2 100644 --- a/InferenceInterfaces/InferenceFastSpeech2.py +++ b/InferenceInterfaces/InferenceFastSpeech2.py @@ -91,7 +91,7 @@ def forward(self, text, view=False, durations=None, pitch=None, energy=None, inp plt.subplots_adjust(left=0.05, bottom=0.1, right=0.95, top=.9, wspace=0.0, hspace=0.0) plt.show() if self.noise_reduce: - wave = 
torch.tensor(noisereduce.reduce_noise(y=wave, y_noise=self.prototypical_noise, sr=48000, stationary=True), device=self.device) + wave = torch.tensor(noisereduce.reduce_noise(y=wave.cpu().numpy(), y_noise=self.prototypical_noise, sr=48000, stationary=True), device=self.device) return wave def read_to_file(self, text_list, file_location, silent=False, dur_list=None, pitch_list=None, energy_list=None): From dabbfaa426fe219bfa31c1223514ba0e0d811fd4 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Mon, 14 Mar 2022 11:06:49 +0100 Subject: [PATCH 24/33] change default embedding in meta and train for longer --- .../FastSpeech2/meta_train_loop.py | 4 +- .../FastSpeech2_GermanLowResource.py | 73 ---- .../FastSpeech2_LibriTTS_600.py | 92 ---- .../FastSpeech2_LibriTTS_asr_out.py | 90 ---- .../FastSpeech2_LibriTTS_asr_phn.py | 92 ---- .../FastSpeech2_LibriTTS_asr_phn_600.py | 92 ---- .../FastSpeech2_MetaCheckpoint.py | 4 +- ...astSpeech2_MetaCheckpoint_germ_finetune.py | 358 ---------------- .../FastSpeech2_MetaCheckpoint_no_Germanic.py | 340 --------------- .../FastSpeech2_MetaCheckpoint_no_Slavic.py | 382 ----------------- ...FastSpeech2_MetaCheckpoint_rus_finetune.py | 399 ------------------ .../FastSpeech2_RussianLowResource.py | 73 ---- .../EvaluationScripts/SpeakerVisualization.py | 52 ++- run_speaker_visualization.py | 54 ++- run_training_pipeline.py | 16 - 15 files changed, 96 insertions(+), 2025 deletions(-) delete mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_GermanLowResource.py delete mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_600.py delete mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py delete mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py delete mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn_600.py delete mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_germ_finetune.py delete mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_no_Germanic.py delete mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_no_Slavic.py delete mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_rus_finetune.py delete mode 100644 TrainingInterfaces/TrainingPipelines/FastSpeech2_RussianLowResource.py diff --git a/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py b/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py index a05edaf9..bd5925c6 100644 --- a/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py +++ b/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py @@ -51,12 +51,12 @@ def train_loop(net, default_embeddings = {"en": None, "de": None, "el": None, "es": None, "fi": None, "ru": None, "hu": None, "nl": None, "fr": None} for index, lang in enumerate(["en", "de", "el", "es", "fi", "ru", "hu", "nl", "fr"]): default_embedding = None - for datapoint in datasets[index]: + for datapoint in datasets[index][:20]: # default embedding for plotting is the average embedding of the first 20 datapoints for each language if default_embedding is None: default_embedding = datapoint[7].squeeze() else: default_embedding = default_embedding + datapoint[7].squeeze() - default_embeddings[lang] = (default_embedding / len(datasets[index])).to(device) + default_embeddings[lang] = (default_embedding / 20).to(device) optimizer = torch.optim.RAdam(net.parameters(), lr=lr, eps=1.0e-06, weight_decay=0.0) grad_scaler = GradScaler() 
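The hunk above replaces the full-dataset average with the mean over only the first 20 utterance embeddings per language, which is then used as the default embedding for the progress plots. A minimal standalone sketch of that averaging step, assuming each embedding is a fixed-size 1-D tensor; the helper name and the dummy data are illustrative and not part of the toolkit:

import torch

def average_default_embedding(embeddings, n=20):
    # elementwise mean over the first n utterance embeddings;
    # if fewer than n are available, all of them are used
    return torch.stack(embeddings[:n]).mean(dim=0)

# usage with dummy data: 100 random 64-dimensional "utterance embeddings"
dummy_embeddings = [torch.randn(64) for _ in range(100)]
print(average_default_embedding(dummy_embeddings).shape)  # torch.Size([64])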
scheduler = WarmupScheduler(optimizer, warmup_steps=warmup_steps) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_GermanLowResource.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_GermanLowResource.py deleted file mode 100644 index 4cb05030..00000000 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_GermanLowResource.py +++ /dev/null @@ -1,73 +0,0 @@ -import random - -import soundfile -import torch - -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop -from Utility.corpus_preparation import prepare_fastspeech_corpus -from Utility.path_to_transcript_dicts import * - - -def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): - if gpu_id == "cpu": - os.environ["CUDA_VISIBLE_DEVICES"] = "" - device = torch.device("cpu") - - else: - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}" - device = torch.device("cuda") - - torch.manual_seed(131714) - random.seed(131714) - torch.random.manual_seed(131714) - - print("Preparing") - - if model_dir is not None: - save_dir = model_dir - else: - save_dir = os.path.join("Models", "FastSpeech2_German_low_resource") - os.makedirs(save_dir, exist_ok=True) - - path_to_transcript_dict_ = build_path_to_transcript_dict_karlsson() - path_to_transcript_dict = dict() - - paths = list(path_to_transcript_dict_.keys()) - used_samples = set() - total_len = 0.0 - while total_len < 5.0 * 60.0: - path = random.choice(paths) - x, sr = soundfile.read(path) - duration = len(x) / sr - if 10 > duration > 5 and path not in used_samples: - used_samples.add(path) - total_len += duration - - print(f"Collected {total_len / 60.0} minutes worth of samples.") - - for key in path_to_transcript_dict_: - if key in used_samples: - path_to_transcript_dict[key] = path_to_transcript_dict_[key] - - train_set = prepare_fastspeech_corpus(transcript_dict=path_to_transcript_dict, - corpus_dir=os.path.join("Corpora", "German_low_resource"), - lang="de") - - model = FastSpeech2(lang_embs=100) - # because we want to finetune it, we treat it as multilingual and multispeaker model, even though it only has one speaker - - print("Training model") - train_loop(net=model, - train_dataset=train_set, - device=device, - save_directory=save_dir, - steps=10000, - batch_size=32, - lang="de", - lr=0.0001, - epochs_per_save=20, - path_to_checkpoint="Models/FastSpeech2_Meta_no_ger/best.pt", - fine_tune=True, - resume=resume) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_600.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_600.py deleted file mode 100644 index a91eb5d2..00000000 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_600.py +++ /dev/null @@ -1,92 +0,0 @@ -import random - -import torch -from torch.utils.data import ConcatDataset -from tqdm import tqdm - -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop -from Utility.corpus_preparation import prepare_fastspeech_corpus -from Utility.path_to_transcript_dicts import * - - -def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): - if gpu_id == "cpu": - os.environ["CUDA_VISIBLE_DEVICES"] = "" - device = torch.device("cpu") - - else: - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(gpu_id) - device = 
torch.device("cuda") - - torch.manual_seed(131714) - random.seed(131714) - torch.random.manual_seed(131714) - - print("Preparing") - - if model_dir is not None: - save_dir = model_dir - else: - save_dir = os.path.join("Models", "FastSpeech2_LibriTTS_600") - os.makedirs(save_dir, exist_ok=True) - - train_set = prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_other500(), - corpus_dir=os.path.join("Corpora", "libri_500"), - lang="en", - phone_input=False, - ctc_selection=True) - - model = FastSpeech2(lang_embs=None) - - train_sets = list() - train_sets.append(train_set) - train_sets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts(), - corpus_dir=os.path.join("Corpora", "libri"), - lang="en", - phone_input=False, - ctc_selection=True)) - - train_set = ConcatDataset(train_sets) - - print("Training model") - train_loop(net=model, - train_dataset=train_set, - device=device, - save_directory=save_dir, - steps=500000, - batch_size=32, - lang="en", - lr=0.0001, - warmup_steps=4000, - path_to_checkpoint="Models/FastSpeech2_LibriTTS/best.pt", - fine_tune=True, - resume=resume) - - -@torch.inference_mode() -def find_faulty_samples(net, - dataset, - device, - path_to_checkpoint): - nan_ids = list() - net = net.to(device) - torch.multiprocessing.set_sharing_strategy('file_system') - check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) - net.load_state_dict(check_dict["model"]) - for datapoint_index in tqdm(range(len(dataset))): - loss = net(text_tensors=dataset[datapoint_index][0].unsqueeze(0).to(device), - text_lengths=dataset[datapoint_index][1].to(device), - gold_speech=dataset[datapoint_index][2].unsqueeze(0).to(device), - speech_lengths=dataset[datapoint_index][3].to(device), - gold_durations=dataset[datapoint_index][4].unsqueeze(0).to(device), - gold_pitch=dataset[datapoint_index][6].unsqueeze(0).to(device), # mind the switched order - gold_energy=dataset[datapoint_index][5].unsqueeze(0).to(device), # mind the switched order - utterance_embedding=dataset[datapoint_index][7].unsqueeze(0).to(device), - lang_ids=None, - return_mels=False).squeeze() - if torch.isnan(loss): - print(f"CAREFUL, NAN DETECTED: {datapoint_index}") - nan_ids.append(datapoint_index) - dataset.remove_samples(nan_ids) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py deleted file mode 100644 index c432f843..00000000 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_out.py +++ /dev/null @@ -1,90 +0,0 @@ -import random - -import torch -from tqdm import tqdm - -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop -from Utility.corpus_preparation import prepare_fastspeech_corpus -from Utility.path_to_transcript_dicts import * - - -def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): - if gpu_id == "cpu": - os.environ["CUDA_VISIBLE_DEVICES"] = "" - device = torch.device("cpu") - - else: - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(gpu_id) - device = torch.device("cuda") - - torch.manual_seed(131714) - random.seed(131714) - torch.random.manual_seed(131714) - - print("Preparing") - - if model_dir is not None: - save_dir = model_dir - else: - save_dir = os.path.join("Models", "FastSpeech2_LibriTTS_asr_out") - 
os.makedirs(save_dir, exist_ok=True) - - train_set = prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_asr_out(), - corpus_dir=os.path.join("Corpora", "libri_asr_out"), - lang="en") - - model = FastSpeech2(lang_embs=None) - - print("Training model") - train_loop(net=model, - train_dataset=train_set, - device=device, - save_directory=save_dir, - steps=500000, - batch_size=32, - lang="en", - lr=0.0001, - warmup_steps=4000, - path_to_checkpoint="Models/FastSpeech2_LibriTTS/best.pt", - fine_tune=True, - resume=resume) - - -@torch.inference_mode() -def find_faulty_samples(net, - datasets, - device, - path_to_checkpoint): - net = net.to(device) - torch.multiprocessing.set_sharing_strategy('file_system') - check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) - net.load_state_dict(check_dict["model"]) - losses = list() - index_pairs = list() - for datapoint_index in tqdm(range(len(datasets))): - loss = net(text_tensors=datasets[datapoint_index][0].unsqueeze(0).to(device), - text_lengths=datasets[datapoint_index][1].to(device), - gold_speech=datasets[datapoint_index][2].unsqueeze(0).to(device), - speech_lengths=datasets[datapoint_index][3].to(device), - gold_durations=datasets[datapoint_index][4].unsqueeze(0).to(device), - gold_pitch=datasets[datapoint_index][6].unsqueeze(0).to(device), # mind the switched order - gold_energy=datasets[datapoint_index][5].unsqueeze(0).to(device), # mind the switched order - utterance_embedding=datasets[datapoint_index][7].unsqueeze(0).to(device), - lang_ids=None, - return_mels=False).squeeze() - if torch.isnan(loss): - print(f"CAREFUL, NAN DETECTED: {datapoint_index}") - losses.append(999999) - else: - losses.append(loss.item()) - index_pairs.append(datapoint_index) - - loss_high_to_low = sorted(losses, reverse=True) - print(loss_high_to_low) - threshold = loss_high_to_low[500] - for index, loss in enumerate(losses): - if loss > threshold: - print(index_pairs[index]) - print(loss) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py deleted file mode 100644 index e724ed35..00000000 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn.py +++ /dev/null @@ -1,92 +0,0 @@ -import random - -import torch -from tqdm import tqdm - -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop -from Utility.corpus_preparation import prepare_fastspeech_corpus -from Utility.path_to_transcript_dicts import * - - -def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): - if gpu_id == "cpu": - os.environ["CUDA_VISIBLE_DEVICES"] = "" - device = torch.device("cpu") - - else: - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(gpu_id) - device = torch.device("cuda") - - torch.manual_seed(131714) - random.seed(131714) - torch.random.manual_seed(131714) - - print("Preparing") - - if model_dir is not None: - save_dir = model_dir - else: - save_dir = os.path.join("Models", "FastSpeech2_LibriTTS_asr_phn") - os.makedirs(save_dir, exist_ok=True) - - train_set = prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_asr_phn(), - corpus_dir=os.path.join("Corpora", "libri_asr_phn"), - lang="en", - phone_input=True, - ctc_selection=False) - - model = FastSpeech2(lang_embs=None) - - print("Training model") - 
train_loop(net=model, - train_dataset=train_set, - device=device, - save_directory=save_dir, - steps=500000, - batch_size=32, - lang="en", - lr=0.0001, - warmup_steps=4000, - path_to_checkpoint="Models/FastSpeech2_LibriTTS/best.pt", - fine_tune=True, - resume=resume) - - -@torch.inference_mode() -def find_faulty_samples(net, - datasets, - device, - path_to_checkpoint): - net = net.to(device) - torch.multiprocessing.set_sharing_strategy('file_system') - check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) - net.load_state_dict(check_dict["model"]) - losses = list() - index_pairs = list() - for datapoint_index in tqdm(range(len(datasets))): - loss = net(text_tensors=datasets[datapoint_index][0].unsqueeze(0).to(device), - text_lengths=datasets[datapoint_index][1].to(device), - gold_speech=datasets[datapoint_index][2].unsqueeze(0).to(device), - speech_lengths=datasets[datapoint_index][3].to(device), - gold_durations=datasets[datapoint_index][4].unsqueeze(0).to(device), - gold_pitch=datasets[datapoint_index][6].unsqueeze(0).to(device), # mind the switched order - gold_energy=datasets[datapoint_index][5].unsqueeze(0).to(device), # mind the switched order - utterance_embedding=datasets[datapoint_index][7].unsqueeze(0).to(device), - lang_ids=None, - return_mels=False).squeeze() - if torch.isnan(loss): - print(f"CAREFUL, NAN DETECTED: {datapoint_index}") - losses.append(999999) - else: - losses.append(loss.item()) - index_pairs.append(datapoint_index) - - loss_high_to_low = sorted(losses, reverse=True) - print(loss_high_to_low) - threshold = loss_high_to_low[500] - for index, loss in enumerate(losses): - if loss > threshold: - print(index_pairs[index]) - print(loss) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn_600.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn_600.py deleted file mode 100644 index c29217f0..00000000 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_LibriTTS_asr_phn_600.py +++ /dev/null @@ -1,92 +0,0 @@ -import random - -import torch -from torch.utils.data import ConcatDataset -from tqdm import tqdm - -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop -from Utility.corpus_preparation import prepare_fastspeech_corpus -from Utility.path_to_transcript_dicts import * - - -def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): - if gpu_id == "cpu": - os.environ["CUDA_VISIBLE_DEVICES"] = "" - device = torch.device("cpu") - - else: - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(gpu_id) - device = torch.device("cuda") - - torch.manual_seed(131714) - random.seed(131714) - torch.random.manual_seed(131714) - - print("Preparing") - - if model_dir is not None: - save_dir = model_dir - else: - save_dir = os.path.join("Models", "FastSpeech2_LibriTTS_asr_phn_600") - os.makedirs(save_dir, exist_ok=True) - - train_set = prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_asr_phn_500(), - corpus_dir=os.path.join("Corpora", "libri_asr_phn_500"), - lang="en", - phone_input=True, - ctc_selection=False) - - model = FastSpeech2(lang_embs=None) - - train_sets = list() - train_sets.append(train_set) - train_sets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts_asr_phn(), - corpus_dir=os.path.join("Corpora", "libri_asr_phn"), - lang="en", - phone_input=True, - 
ctc_selection=False)) - - train_set = ConcatDataset(train_sets) - - print("Training model") - train_loop(net=model, - train_dataset=train_set, - device=device, - save_directory=save_dir, - steps=500000, - batch_size=32, - lang="en", - lr=0.0001, - warmup_steps=4000, - path_to_checkpoint="Models/FastSpeech2_LibriTTS_asr_phn/best.pt", - fine_tune=True, - resume=resume) - - -@torch.inference_mode() -def find_faulty_samples(net, - dataset, - device, - path_to_checkpoint): - nan_ids = list() - net = net.to(device) - torch.multiprocessing.set_sharing_strategy('file_system') - check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) - net.load_state_dict(check_dict["model"]) - for datapoint_index in tqdm(range(len(dataset))): - loss = net(text_tensors=dataset[datapoint_index][0].unsqueeze(0).to(device), - text_lengths=dataset[datapoint_index][1].to(device), - gold_speech=dataset[datapoint_index][2].unsqueeze(0).to(device), - speech_lengths=dataset[datapoint_index][3].to(device), - gold_durations=dataset[datapoint_index][4].unsqueeze(0).to(device), - gold_pitch=dataset[datapoint_index][6].unsqueeze(0).to(device), # mind the switched order - gold_energy=dataset[datapoint_index][5].unsqueeze(0).to(device), # mind the switched order - utterance_embedding=dataset[datapoint_index][7].unsqueeze(0).to(device), - lang_ids=None, - return_mels=False).squeeze() - if torch.isnan(loss): - print(f"CAREFUL, NAN DETECTED: {datapoint_index}") - nan_ids.append(datapoint_index) - dataset.remove_samples(nan_ids) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py index 48e06849..d7c3139b 100644 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py @@ -169,9 +169,9 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, remove_faulty_sa train_loop(net=FastSpeech2(lang_embs=100), device=torch.device("cuda"), datasets=datasets, - batch_size=5, + batch_size=6, save_directory=meta_save_dir, - steps=100000, + steps=200000, steps_per_checkpoint=1000, lr=0.001, path_to_checkpoint=resume_checkpoint, diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_germ_finetune.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_germ_finetune.py deleted file mode 100644 index a94fd448..00000000 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_germ_finetune.py +++ /dev/null @@ -1,358 +0,0 @@ -import random - -import librosa.display as lbd -import matplotlib.pyplot as plt -import torch -import torch.multiprocessing -from torch.cuda.amp import GradScaler -from torch.cuda.amp import autocast -from torch.nn.utils.rnn import pad_sequence -from torch.utils.data.dataloader import DataLoader -from tqdm import tqdm - -from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend -from Preprocessing.ArticulatoryCombinedTextFrontend import get_language_id -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 -from Utility.WarmupScheduler import WarmupScheduler -from Utility.corpus_preparation import prepare_fastspeech_corpus -from Utility.path_to_transcript_dicts import * -from Utility.utils import cumsum_durations -from Utility.utils import delete_old_checkpoints -from Utility.utils import get_most_recent_checkpoint - - -def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, 
find_faulty_samples_mode=False): - torch.manual_seed(131714) - random.seed(131714) - torch.random.manual_seed(131714) - - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}" - - datasets = list() - - resume_checkpoint = "Models/FastSpeech2_German_low_resource/best.pt" - - base_dir = os.path.join("Models", "FastSpeech2_Meta_joint_finetune_german") - if model_dir is not None: - meta_save_dir = model_dir - else: - meta_save_dir = base_dir - os.makedirs(meta_save_dir, exist_ok=True) - - print("Preparing") - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_karlsson(), - corpus_dir=os.path.join("Corpora", "German_low_resource"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_spanish(), - corpus_dir=os.path.join("Corpora", "mls_spanish"), - lang="es", - ctc_selection=False)) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10el(), - corpus_dir=os.path.join("Corpora", "meta_Greek"), - lang="el")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10fi(), - corpus_dir=os.path.join("Corpora", "meta_Finnish"), - lang="fi")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10ru(), - corpus_dir=os.path.join("Corpora", "meta_Russian"), - lang="ru")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10hu(), - corpus_dir=os.path.join("Corpora", "meta_Hungarian"), - lang="hu")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10fr(), - corpus_dir=os.path.join("Corpora", "meta_French"), - lang="fr")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_spanish_blizzard_train(), - corpus_dir=os.path.join("Corpora", "spanish_blizzard"), - lang="es")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_portuguese(), - corpus_dir=os.path.join("Corpora", "mls_porto"), - lang="pt", - ctc_selection=False)) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_polish(), - corpus_dir=os.path.join("Corpora", "mls_polish"), - lang="pl", - ctc_selection=False)) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10es(), - corpus_dir=os.path.join("Corpora", "meta_Spanish"), - lang="es")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_french(), - corpus_dir=os.path.join("Corpora", "mls_french"), - lang="fr", - ctc_selection=False)) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_italian(), - corpus_dir=os.path.join("Corpora", "mls_italian"), - lang="it", - ctc_selection=False)) - - if find_faulty_samples_mode: - find_faulty_samples(net=FastSpeech2(lang_embs=100), - datasets=datasets, - device=torch.device("cuda"), - path_to_checkpoint=resume_checkpoint) - else: - train_loop(net=FastSpeech2(lang_embs=100), - device=torch.device("cuda"), - datasets=datasets, - batch_size=4, - save_directory=meta_save_dir, - steps=100000, - steps_per_checkpoint=1000, - lr=0.0001, - path_to_checkpoint=resume_checkpoint, - resume=resume) - - -@torch.no_grad() -def find_faulty_samples(net, - datasets, - device, - path_to_checkpoint): - net = net.to(device) - 
torch.multiprocessing.set_sharing_strategy('file_system') - check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) - net.load_state_dict(check_dict["model"]) - losses = list() - index_pairs = list() - for dataset_index in range(len(datasets)): - for datapoint_index in tqdm(range(len(datasets[dataset_index]))): - loss = net(text_tensors=datasets[dataset_index][datapoint_index][0].unsqueeze(0).to(device), - text_lengths=datasets[dataset_index][datapoint_index][1].to(device), - gold_speech=datasets[dataset_index][datapoint_index][2].unsqueeze(0).to(device), - speech_lengths=datasets[dataset_index][datapoint_index][3].to(device), - gold_durations=datasets[dataset_index][datapoint_index][4].unsqueeze(0).to(device), - gold_pitch=datasets[dataset_index][datapoint_index][6].unsqueeze(0).to(device), # mind the switched order - gold_energy=datasets[dataset_index][datapoint_index][5].unsqueeze(0).to(device), # mind the switched order - utterance_embedding=datasets[dataset_index][datapoint_index][7].unsqueeze(0).to(device), - lang_ids=datasets[dataset_index][datapoint_index][8].unsqueeze(0).to(device), - return_mels=False).squeeze() - if torch.isnan(loss): - print(f"CAREFUL, NAN DETECTED: {dataset_index}, {datapoint_index}") - losses.append(loss.item()) - index_pairs.append((dataset_index, datapoint_index)) - loss_high_to_low = sorted(losses, reverse=True) - print(loss_high_to_low) - threshold = loss_high_to_low[1000] - for index, loss in enumerate(losses): - if loss > threshold: - print(index_pairs[index]) - print(loss) - - -def train_loop(net, - datasets, - device, - save_directory, - batch_size, - steps, - steps_per_checkpoint, - lr, - path_to_checkpoint, - resume=False): - # ============ - # Preparations - # ============ - net = net.to(device) - torch.multiprocessing.set_sharing_strategy('file_system') - train_loaders = list() - train_iters = list() - for i, dataset in enumerate(datasets): - if i == 0: - train_loaders.append(DataLoader(batch_size=12, - dataset=dataset, - drop_last=True, - num_workers=2, - pin_memory=True, - shuffle=True, - prefetch_factor=5, - collate_fn=collate_and_pad, - persistent_workers=True)) - else: - train_loaders.append(DataLoader(batch_size=2, - dataset=dataset, - drop_last=True, - num_workers=2, - pin_memory=True, - shuffle=True, - prefetch_factor=5, - collate_fn=collate_and_pad, - persistent_workers=True)) - train_iters.append(iter(train_loaders[-1])) - - default_embeddings = {"de": None, "el": None, "es": None, "fi": None, "ru": None, "hu": None, "nl": None, "fr": None} - for index, lang in enumerate(["de", "el", "es", "fi", "ru", "hu", "nl", "fr"]): - default_embedding = None - for datapoint in datasets[index]: - if default_embedding is None: - default_embedding = datapoint[7].squeeze() - else: - default_embedding = default_embedding + datapoint[7].squeeze() - default_embeddings[lang] = (default_embedding / len(datasets[index])).to(device) - optimizer = torch.optim.RAdam(net.parameters(), lr=lr, eps=1.0e-06, weight_decay=0.0) - grad_scaler = GradScaler() - scheduler = WarmupScheduler(optimizer, warmup_steps=4000) - if resume: - previous_checkpoint = get_most_recent_checkpoint(checkpoint_dir=save_directory) - if previous_checkpoint is not None: - path_to_checkpoint = previous_checkpoint - else: - raise RuntimeError(f"No checkpoint found that can be resumed from in {save_directory}") - step_counter = 0 - train_losses_total = list() - if path_to_checkpoint is not None: - check_dict = torch.load(os.path.join(path_to_checkpoint), 
map_location=device) - net.load_state_dict(check_dict["model"]) - if resume: - optimizer.load_state_dict(check_dict["optimizer"]) - step_counter = check_dict["step_counter"] - grad_scaler.load_state_dict(check_dict["scaler"]) - scheduler.load_state_dict(check_dict["scheduler"]) - if step_counter > steps: - print("Desired steps already reached in loaded checkpoint.") - return - - net.train() - # ============================= - # Actual train loop starts here - # ============================= - for step in tqdm(range(step_counter, steps)): - batches = [] - for index in range(len(datasets)): - # we get one batch for each task (i.e. language in this case) - try: - batch = next(train_iters[index]) - batches.append(batch) - except StopIteration: - train_iters[index] = iter(train_loaders[index]) - batch = next(train_iters[index]) - batches.append(batch) - train_loss = 0.0 - for batch in batches: - with autocast(): - # we sum the loss for each task, as we would do for the - # second order regular MAML, but we do it only over one - # step (i.e. iterations of inner loop = 1) - train_loss = train_loss + net(text_tensors=batch[0].to(device), - text_lengths=batch[1].to(device), - gold_speech=batch[2].to(device), - speech_lengths=batch[3].to(device), - gold_durations=batch[4].to(device), - gold_pitch=batch[6].to(device), # mind the switched order - gold_energy=batch[5].to(device), # mind the switched order - utterance_embedding=batch[7].to(device), - lang_ids=batch[8].to(device), - return_mels=False) - # then we directly update our meta-parameters without - # the need for any task specific parameters - train_losses_total.append(train_loss.item()) - optimizer.zero_grad() - grad_scaler.scale(train_loss).backward() - grad_scaler.unscale_(optimizer) - torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0, error_if_nonfinite=False) - grad_scaler.step(optimizer) - grad_scaler.update() - scheduler.step() - - if step % steps_per_checkpoint == 0: - # ============================== - # Enough steps for some insights - # ============================== - net.eval() - print(f"Total Loss: {round(sum(train_losses_total) / len(train_losses_total), 3)}") - train_losses_total = list() - torch.save({ - "model": net.state_dict(), - "optimizer": optimizer.state_dict(), - "scaler": grad_scaler.state_dict(), - "scheduler": scheduler.state_dict(), - "step_counter": step, - "default_emb": default_embeddings["es"] - }, - os.path.join(save_directory, "checkpoint_{}.pt".format(step))) - delete_old_checkpoints(save_directory, keep=5) - for lang in ["de", "el", "es", "fi", "ru", "hu", "nl", "fr"]: - plot_progress_spec(net=net, - device=device, - lang=lang, - save_dir=save_directory, - step=step, - utt_embeds=default_embeddings) - net.train() - - -@torch.inference_mode() -def plot_progress_spec(net, device, save_dir, step, lang, utt_embeds): - tf = ArticulatoryCombinedTextFrontend(language=lang) - sentence = "" - default_embed = utt_embeds[lang] - if lang == "en": - sentence = "This is a complex sentence, it even has a pause!" - elif lang == "de": - sentence = "Dies ist ein komplexer Satz, er hat sogar eine Pause!" - elif lang == "el": - sentence = "Αυτή είναι μια σύνθετη πρόταση, έχει ακόμη και παύση!" - elif lang == "es": - sentence = "Esta es una oración compleja, ¡incluso tiene una pausa!" - elif lang == "fi": - sentence = "Tämä on monimutkainen lause, sillä on jopa tauko!" - elif lang == "ru": - sentence = "Это сложное предложение, в нем даже есть пауза!" 
- elif lang == "hu": - sentence = "Ez egy összetett mondat, még szünet is van benne!" - elif lang == "nl": - sentence = "Dit is een complexe zin, er zit zelfs een pauze in!" - elif lang == "fr": - sentence = "C'est une phrase complexe, elle a même une pause !" - phoneme_vector = tf.string_to_tensor(sentence).squeeze(0).to(device) - spec, durations, *_ = net.inference(text=phoneme_vector, - return_duration_pitch_energy=True, - utterance_embedding=default_embed, - lang_id=get_language_id(lang).to(device)) - spec = spec.transpose(0, 1).to("cpu").numpy() - duration_splits, label_positions = cumsum_durations(durations.cpu().numpy()) - if not os.path.exists(os.path.join(save_dir, "spec")): - os.makedirs(os.path.join(save_dir, "spec")) - fig, ax = plt.subplots(nrows=1, ncols=1) - lbd.specshow(spec, - ax=ax, - sr=16000, - cmap='GnBu', - y_axis='mel', - x_axis=None, - hop_length=256) - ax.yaxis.set_visible(False) - ax.set_xticks(duration_splits, minor=True) - ax.xaxis.grid(True, which='minor') - ax.set_xticks(label_positions, minor=False) - ax.set_xticklabels(tf.get_phone_string(sentence)) - ax.set_title(sentence) - plt.savefig(os.path.join(os.path.join(save_dir, "spec"), f"{step}_{lang}.png")) - plt.clf() - plt.close() - - -def collate_and_pad(batch): - # text, text_len, speech, speech_len, durations, energy, pitch, utterance condition, language_id - return (pad_sequence([datapoint[0] for datapoint in batch], batch_first=True), - torch.stack([datapoint[1] for datapoint in batch]).squeeze(1), - pad_sequence([datapoint[2] for datapoint in batch], batch_first=True), - torch.stack([datapoint[3] for datapoint in batch]).squeeze(1), - pad_sequence([datapoint[4] for datapoint in batch], batch_first=True), - pad_sequence([datapoint[5] for datapoint in batch], batch_first=True), - pad_sequence([datapoint[6] for datapoint in batch], batch_first=True), - torch.stack([datapoint[7] for datapoint in batch]).squeeze(), - torch.stack([datapoint[8] for datapoint in batch])) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_no_Germanic.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_no_Germanic.py deleted file mode 100644 index f716953f..00000000 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_no_Germanic.py +++ /dev/null @@ -1,340 +0,0 @@ -import random - -import librosa.display as lbd -import matplotlib.pyplot as plt -import torch -import torch.multiprocessing -from torch.cuda.amp import GradScaler -from torch.cuda.amp import autocast -from torch.nn.utils.rnn import pad_sequence -from torch.utils.data.dataloader import DataLoader -from tqdm import tqdm - -from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend -from Preprocessing.ArticulatoryCombinedTextFrontend import get_language_id -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 -from Utility.WarmupScheduler import WarmupScheduler -from Utility.corpus_preparation import prepare_fastspeech_corpus -from Utility.path_to_transcript_dicts import * -from Utility.utils import cumsum_durations -from Utility.utils import delete_old_checkpoints -from Utility.utils import get_most_recent_checkpoint - - -def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, find_faulty_samples_mode=False): - torch.manual_seed(131714) - random.seed(131714) - torch.random.manual_seed(131714) - - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}" - - datasets = list() - - base_dir = 
os.path.join("Models", "FastSpeech2_Meta_no_ger") - if model_dir is not None: - meta_save_dir = model_dir - else: - meta_save_dir = base_dir - os.makedirs(meta_save_dir, exist_ok=True) - - print("Preparing") - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_spanish(), - corpus_dir=os.path.join("Corpora", "mls_spanish"), - lang="es", - ctc_selection=False)) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10el(), - corpus_dir=os.path.join("Corpora", "meta_Greek"), - lang="el")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10fi(), - corpus_dir=os.path.join("Corpora", "meta_Finnish"), - lang="fi")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10ru(), - corpus_dir=os.path.join("Corpora", "meta_Russian"), - lang="ru")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10hu(), - corpus_dir=os.path.join("Corpora", "meta_Hungarian"), - lang="hu")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10fr(), - corpus_dir=os.path.join("Corpora", "meta_French"), - lang="fr")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_spanish_blizzard_train(), - corpus_dir=os.path.join("Corpora", "spanish_blizzard"), - lang="es")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_portuguese(), - corpus_dir=os.path.join("Corpora", "mls_porto"), - lang="pt", - ctc_selection=False)) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_polish(), - corpus_dir=os.path.join("Corpora", "mls_polish"), - lang="pl", - ctc_selection=False)) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10es(), - corpus_dir=os.path.join("Corpora", "meta_Spanish"), - lang="es")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_french(), - corpus_dir=os.path.join("Corpora", "mls_french"), - lang="fr", - ctc_selection=False)) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_italian(), - corpus_dir=os.path.join("Corpora", "mls_italian"), - lang="it", - ctc_selection=False)) - - if find_faulty_samples_mode: - find_faulty_samples(net=FastSpeech2(lang_embs=100), - datasets=datasets, - device=torch.device("cuda"), - path_to_checkpoint=resume_checkpoint) - else: - train_loop(net=FastSpeech2(lang_embs=100), - device=torch.device("cuda"), - datasets=datasets, - batch_size=4, - save_directory=meta_save_dir, - steps=100000, - steps_per_checkpoint=1000, - lr=0.0001, - path_to_checkpoint=resume_checkpoint, - resume=resume) - - -@torch.no_grad() -def find_faulty_samples(net, - datasets, - device, - path_to_checkpoint): - net = net.to(device) - torch.multiprocessing.set_sharing_strategy('file_system') - check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) - net.load_state_dict(check_dict["model"]) - losses = list() - index_pairs = list() - for dataset_index in range(len(datasets)): - for datapoint_index in tqdm(range(len(datasets[dataset_index]))): - loss = net(text_tensors=datasets[dataset_index][datapoint_index][0].unsqueeze(0).to(device), - text_lengths=datasets[dataset_index][datapoint_index][1].to(device), - 
gold_speech=datasets[dataset_index][datapoint_index][2].unsqueeze(0).to(device), - speech_lengths=datasets[dataset_index][datapoint_index][3].to(device), - gold_durations=datasets[dataset_index][datapoint_index][4].unsqueeze(0).to(device), - gold_pitch=datasets[dataset_index][datapoint_index][6].unsqueeze(0).to(device), # mind the switched order - gold_energy=datasets[dataset_index][datapoint_index][5].unsqueeze(0).to(device), # mind the switched order - utterance_embedding=datasets[dataset_index][datapoint_index][7].unsqueeze(0).to(device), - lang_ids=datasets[dataset_index][datapoint_index][8].unsqueeze(0).to(device), - return_mels=False).squeeze() - if torch.isnan(loss): - print(f"CAREFUL, NAN DETECTED: {dataset_index}, {datapoint_index}") - losses.append(loss.item()) - index_pairs.append((dataset_index, datapoint_index)) - loss_high_to_low = sorted(losses, reverse=True) - print(loss_high_to_low) - threshold = loss_high_to_low[1000] - for index, loss in enumerate(losses): - if loss > threshold: - print(index_pairs[index]) - print(loss) - - -def train_loop(net, - datasets, - device, - save_directory, - batch_size, - steps, - steps_per_checkpoint, - lr, - path_to_checkpoint, - resume=False): - # ============ - # Preparations - # ============ - net = net.to(device) - torch.multiprocessing.set_sharing_strategy('file_system') - train_loaders = list() - train_iters = list() - for dataset in datasets: - train_loaders.append(DataLoader(batch_size=batch_size, - dataset=dataset, - drop_last=True, - num_workers=2, - pin_memory=True, - shuffle=True, - prefetch_factor=5, - collate_fn=collate_and_pad, - persistent_workers=True)) - train_iters.append(iter(train_loaders[-1])) - default_embeddings = {"el": None, "es": None, "fi": None, "ru": None, "hu": None, "nl": None, "fr": None} - for index, lang in enumerate(["el", "es", "fi", "ru", "hu", "nl", "fr"]): - default_embedding = None - for datapoint in datasets[index]: - if default_embedding is None: - default_embedding = datapoint[7].squeeze() - else: - default_embedding = default_embedding + datapoint[7].squeeze() - default_embeddings[lang] = (default_embedding / len(datasets[index])).to(device) - optimizer = torch.optim.RAdam(net.parameters(), lr=lr, eps=1.0e-06, weight_decay=0.0) - grad_scaler = GradScaler() - scheduler = WarmupScheduler(optimizer, warmup_steps=4000) - if resume: - previous_checkpoint = get_most_recent_checkpoint(checkpoint_dir=save_directory) - if previous_checkpoint is not None: - path_to_checkpoint = previous_checkpoint - else: - raise RuntimeError(f"No checkpoint found that can be resumed from in {save_directory}") - step_counter = 0 - train_losses_total = list() - if path_to_checkpoint is not None: - check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) - net.load_state_dict(check_dict["model"]) - if resume: - optimizer.load_state_dict(check_dict["optimizer"]) - step_counter = check_dict["step_counter"] - grad_scaler.load_state_dict(check_dict["scaler"]) - scheduler.load_state_dict(check_dict["scheduler"]) - if step_counter > steps: - print("Desired steps already reached in loaded checkpoint.") - return - - net.train() - # ============================= - # Actual train loop starts here - # ============================= - for step in tqdm(range(step_counter, steps)): - batches = [] - for index in range(len(datasets)): - # we get one batch for each task (i.e. 
language in this case) - try: - batch = next(train_iters[index]) - batches.append(batch) - except StopIteration: - train_iters[index] = iter(train_loaders[index]) - batch = next(train_iters[index]) - batches.append(batch) - train_loss = 0.0 - for batch in batches: - with autocast(): - # we sum the loss for each task, as we would do for the - # second order regular MAML, but we do it only over one - # step (i.e. iterations of inner loop = 1) - train_loss = train_loss + net(text_tensors=batch[0].to(device), - text_lengths=batch[1].to(device), - gold_speech=batch[2].to(device), - speech_lengths=batch[3].to(device), - gold_durations=batch[4].to(device), - gold_pitch=batch[6].to(device), # mind the switched order - gold_energy=batch[5].to(device), # mind the switched order - utterance_embedding=batch[7].to(device), - lang_ids=batch[8].to(device), - return_mels=False) - # then we directly update our meta-parameters without - # the need for any task specific parameters - train_losses_total.append(train_loss.item()) - optimizer.zero_grad() - grad_scaler.scale(train_loss).backward() - grad_scaler.unscale_(optimizer) - torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0, error_if_nonfinite=False) - grad_scaler.step(optimizer) - grad_scaler.update() - scheduler.step() - - if step % steps_per_checkpoint == 0: - # ============================== - # Enough steps for some insights - # ============================== - net.eval() - print(f"Total Loss: {round(sum(train_losses_total) / len(train_losses_total), 3)}") - train_losses_total = list() - torch.save({ - "model" : net.state_dict(), - "optimizer" : optimizer.state_dict(), - "scaler" : grad_scaler.state_dict(), - "scheduler" : scheduler.state_dict(), - "step_counter": step, - "default_emb": default_embeddings["es"] - }, - os.path.join(save_directory, "checkpoint_{}.pt".format(step))) - delete_old_checkpoints(save_directory, keep=5) - for lang in ["el", "es", "fi", "ru", "hu", "nl", "fr"]: - plot_progress_spec(net=net, - device=device, - lang=lang, - save_dir=save_directory, - step=step, - utt_embeds=default_embeddings) - net.train() - - -@torch.inference_mode() -def plot_progress_spec(net, device, save_dir, step, lang, utt_embeds): - tf = ArticulatoryCombinedTextFrontend(language=lang) - sentence = "" - default_embed = utt_embeds[lang] - if lang == "en": - sentence = "This is a complex sentence, it even has a pause!" - elif lang == "de": - sentence = "Dies ist ein komplexer Satz, er hat sogar eine Pause!" - elif lang == "el": - sentence = "Αυτή είναι μια σύνθετη πρόταση, έχει ακόμη και παύση!" - elif lang == "es": - sentence = "Esta es una oración compleja, ¡incluso tiene una pausa!" - elif lang == "fi": - sentence = "Tämä on monimutkainen lause, sillä on jopa tauko!" - elif lang == "ru": - sentence = "Это сложное предложение, в нем даже есть пауза!" - elif lang == "hu": - sentence = "Ez egy összetett mondat, még szünet is van benne!" - elif lang == "nl": - sentence = "Dit is een complexe zin, er zit zelfs een pauze in!" - elif lang == "fr": - sentence = "C'est une phrase complexe, elle a même une pause !" 
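As the inline comments in the deleted train loop above spell out, each step draws one batch per language, sums the task losses, and applies a single shared update, i.e. a meta-learning step with an inner-loop length of one and no task-specific parameters. A stripped-down sketch of that update pattern under mixed precision; the function name and the compute_loss callable are illustrative placeholders, not part of the repository:

import torch
from torch.cuda.amp import GradScaler, autocast

def summed_task_update(net, batches, compute_loss, optimizer, grad_scaler):
    # batches holds exactly one batch per task (here: per language);
    # compute_loss(net, batch) must return a scalar loss tensor.
    optimizer.zero_grad()
    total_loss = 0.0
    with autocast():
        for batch in batches:
            total_loss = total_loss + compute_loss(net, batch)
    grad_scaler.scale(total_loss).backward()
    grad_scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0, error_if_nonfinite=False)
    grad_scaler.step(optimizer)
    grad_scaler.update()
    return total_loss.item()

Because only one inner step is taken per task, the shared weights can be updated directly, which is why the deleted loop keeps no per-language copies of the model.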
- phoneme_vector = tf.string_to_tensor(sentence).squeeze(0).to(device) - spec, durations, *_ = net.inference(text=phoneme_vector, - return_duration_pitch_energy=True, - utterance_embedding=default_embed, - lang_id=get_language_id(lang).to(device)) - spec = spec.transpose(0, 1).to("cpu").numpy() - duration_splits, label_positions = cumsum_durations(durations.cpu().numpy()) - if not os.path.exists(os.path.join(save_dir, "spec")): - os.makedirs(os.path.join(save_dir, "spec")) - fig, ax = plt.subplots(nrows=1, ncols=1) - lbd.specshow(spec, - ax=ax, - sr=16000, - cmap='GnBu', - y_axis='mel', - x_axis=None, - hop_length=256) - ax.yaxis.set_visible(False) - ax.set_xticks(duration_splits, minor=True) - ax.xaxis.grid(True, which='minor') - ax.set_xticks(label_positions, minor=False) - ax.set_xticklabels(tf.get_phone_string(sentence)) - ax.set_title(sentence) - plt.savefig(os.path.join(os.path.join(save_dir, "spec"), f"{step}_{lang}.png")) - plt.clf() - plt.close() - - -def collate_and_pad(batch): - # text, text_len, speech, speech_len, durations, energy, pitch, utterance condition, language_id - return (pad_sequence([datapoint[0] for datapoint in batch], batch_first=True), - torch.stack([datapoint[1] for datapoint in batch]).squeeze(1), - pad_sequence([datapoint[2] for datapoint in batch], batch_first=True), - torch.stack([datapoint[3] for datapoint in batch]).squeeze(1), - pad_sequence([datapoint[4] for datapoint in batch], batch_first=True), - pad_sequence([datapoint[5] for datapoint in batch], batch_first=True), - pad_sequence([datapoint[6] for datapoint in batch], batch_first=True), - torch.stack([datapoint[7] for datapoint in batch]).squeeze(), - torch.stack([datapoint[8] for datapoint in batch])) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_no_Slavic.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_no_Slavic.py deleted file mode 100644 index b417f3b3..00000000 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_no_Slavic.py +++ /dev/null @@ -1,382 +0,0 @@ -import random - -import librosa.display as lbd -import matplotlib.pyplot as plt -import torch -import torch.multiprocessing -from torch.cuda.amp import GradScaler -from torch.cuda.amp import autocast -from torch.nn.utils.rnn import pad_sequence -from torch.utils.data.dataloader import DataLoader -from tqdm import tqdm - -from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend -from Preprocessing.ArticulatoryCombinedTextFrontend import get_language_id -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 -from Utility.WarmupScheduler import WarmupScheduler -from Utility.corpus_preparation import prepare_fastspeech_corpus -from Utility.path_to_transcript_dicts import * -from Utility.utils import cumsum_durations -from Utility.utils import delete_old_checkpoints -from Utility.utils import get_most_recent_checkpoint - - -def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, find_faulty_samples_mode=False): - torch.manual_seed(131714) - random.seed(131714) - torch.random.manual_seed(131714) - - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}" - - datasets = list() - - base_dir = os.path.join("Models", "FastSpeech2_Meta_no_slav") - if model_dir is not None: - meta_save_dir = model_dir - else: - meta_save_dir = base_dir - os.makedirs(meta_save_dir, exist_ok=True) - - print("Preparing") - 
datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_nancy(), - corpus_dir=os.path.join("Corpora", "Nancy"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_karlsson(), - corpus_dir=os.path.join("Corpora", "Karlsson"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10el(), - corpus_dir=os.path.join("Corpora", "meta_Greek"), - lang="el")) - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_spanish_blizzard_train(), - corpus_dir=os.path.join("Corpora", "spanish_blizzard"), - lang="es")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10fi(), - corpus_dir=os.path.join("Corpora", "meta_Finnish"), - lang="fi")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10hu(), - corpus_dir=os.path.join("Corpora", "meta_Hungarian"), - lang="hu")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10nl(), - corpus_dir=os.path.join("Corpora", "meta_Dutch"), - lang="nl")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10fr(), - corpus_dir=os.path.join("Corpora", "meta_French"), - lang="fr")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_ljspeech(), - corpus_dir=os.path.join("Corpora", "LJSpeech"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts(), - corpus_dir=os.path.join("Corpora", "libri"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_vctk(), - corpus_dir=os.path.join("Corpora", "vctk"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_nvidia_hifitts(), - corpus_dir=os.path.join("Corpora", "hifi"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10es(), - corpus_dir=os.path.join("Corpora", "meta_Spanish"), - lang="es")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_eva(), - corpus_dir=os.path.join("Corpora", "Eva"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_hokus(), - corpus_dir=os.path.join("Corpora", "Hokus"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_bernd(), - corpus_dir=os.path.join("Corpora", "Bernd"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_hui_others(), - corpus_dir=os.path.join("Corpora", "hui_others"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_thorsten(), - corpus_dir=os.path.join("Corpora", "Thorsten"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_fluxsing(), - corpus_dir=os.path.join("Corpora", "flux_sing"), - lang="en", - ctc_selection=False)) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_portuguese(), - corpus_dir=os.path.join("Corpora", "mls_porto"), - lang="pt")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_spanish(), - corpus_dir=os.path.join("Corpora", "mls_spanish"), - 
lang="es")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_french(), - corpus_dir=os.path.join("Corpora", "mls_french"), - lang="fr")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_italian(), - corpus_dir=os.path.join("Corpora", "mls_italian"), - lang="it")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_dutch(), - corpus_dir=os.path.join("Corpora", "mls_dutch"), - lang="nl")) - - if find_faulty_samples_mode: - find_faulty_samples(net=FastSpeech2(lang_embs=100), - datasets=datasets, - device=torch.device("cuda"), - path_to_checkpoint=resume_checkpoint) - else: - train_loop(net=FastSpeech2(lang_embs=100), - device=torch.device("cuda"), - datasets=datasets, - batch_size=4, - save_directory=meta_save_dir, - steps=100000, - steps_per_checkpoint=1000, - lr=0.0001, - path_to_checkpoint=resume_checkpoint, - resume=resume) - - -@torch.no_grad() -def find_faulty_samples(net, - datasets, - device, - path_to_checkpoint): - net = net.to(device) - torch.multiprocessing.set_sharing_strategy('file_system') - check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) - net.load_state_dict(check_dict["model"]) - losses = list() - index_pairs = list() - for dataset_index in range(len(datasets)): - for datapoint_index in tqdm(range(len(datasets[dataset_index]))): - loss = net(text_tensors=datasets[dataset_index][datapoint_index][0].unsqueeze(0).to(device), - text_lengths=datasets[dataset_index][datapoint_index][1].to(device), - gold_speech=datasets[dataset_index][datapoint_index][2].unsqueeze(0).to(device), - speech_lengths=datasets[dataset_index][datapoint_index][3].to(device), - gold_durations=datasets[dataset_index][datapoint_index][4].unsqueeze(0).to(device), - gold_pitch=datasets[dataset_index][datapoint_index][6].unsqueeze(0).to(device), # mind the switched order - gold_energy=datasets[dataset_index][datapoint_index][5].unsqueeze(0).to(device), # mind the switched order - utterance_embedding=datasets[dataset_index][datapoint_index][7].unsqueeze(0).to(device), - lang_ids=datasets[dataset_index][datapoint_index][8].unsqueeze(0).to(device), - return_mels=False).squeeze() - if torch.isnan(loss): - print(f"CAREFUL, NAN DETECTED: {dataset_index}, {datapoint_index}") - losses.append(loss.item()) - index_pairs.append((dataset_index, datapoint_index)) - loss_high_to_low = sorted(losses, reverse=True) - print(loss_high_to_low) - threshold = loss_high_to_low[1000] - for index, loss in enumerate(losses): - if loss > threshold: - print(index_pairs[index]) - print(loss) - - -def train_loop(net, - datasets, - device, - save_directory, - batch_size, - steps, - steps_per_checkpoint, - lr, - path_to_checkpoint, - resume=False): - # ============ - # Preparations - # ============ - net = net.to(device) - torch.multiprocessing.set_sharing_strategy('file_system') - train_loaders = list() - train_iters = list() - for dataset in datasets: - train_loaders.append(DataLoader(batch_size=batch_size, - dataset=dataset, - drop_last=True, - num_workers=2, - pin_memory=True, - shuffle=True, - prefetch_factor=5, - collate_fn=collate_and_pad, - persistent_workers=True)) - train_iters.append(iter(train_loaders[-1])) - default_embeddings = {"en": None, "de": None, "el": None, "es": None, "fi": None, "hu": None, "nl": None, "fr": None} - for index, lang in enumerate(["en", "de", "el", "es", "fi", "hu", "nl", "fr"]): - default_embedding = None - for datapoint in 
datasets[index]: - if default_embedding is None: - default_embedding = datapoint[7].squeeze() - else: - default_embedding = default_embedding + datapoint[7].squeeze() - default_embeddings[lang] = (default_embedding / len(datasets[index])).to(device) - optimizer = torch.optim.RAdam(net.parameters(), lr=lr, eps=1.0e-06, weight_decay=0.0) - grad_scaler = GradScaler() - scheduler = WarmupScheduler(optimizer, warmup_steps=4000) - if resume: - previous_checkpoint = get_most_recent_checkpoint(checkpoint_dir=save_directory) - if previous_checkpoint is not None: - path_to_checkpoint = previous_checkpoint - else: - raise RuntimeError(f"No checkpoint found that can be resumed from in {save_directory}") - step_counter = 0 - train_losses_total = list() - if path_to_checkpoint is not None: - check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) - net.load_state_dict(check_dict["model"]) - if resume: - optimizer.load_state_dict(check_dict["optimizer"]) - step_counter = check_dict["step_counter"] - grad_scaler.load_state_dict(check_dict["scaler"]) - scheduler.load_state_dict(check_dict["scheduler"]) - if step_counter > steps: - print("Desired steps already reached in loaded checkpoint.") - return - - net.train() - # ============================= - # Actual train loop starts here - # ============================= - for step in tqdm(range(step_counter, steps)): - batches = [] - for index in range(len(datasets)): - # we get one batch for each task (i.e. language in this case) - try: - batch = next(train_iters[index]) - batches.append(batch) - except StopIteration: - train_iters[index] = iter(train_loaders[index]) - batch = next(train_iters[index]) - batches.append(batch) - train_loss = 0.0 - for batch in batches: - with autocast(): - # we sum the loss for each task, as we would do for the - # second order regular MAML, but we do it only over one - # step (i.e. 
iterations of inner loop = 1) - train_loss = train_loss + net(text_tensors=batch[0].to(device), - text_lengths=batch[1].to(device), - gold_speech=batch[2].to(device), - speech_lengths=batch[3].to(device), - gold_durations=batch[4].to(device), - gold_pitch=batch[6].to(device), # mind the switched order - gold_energy=batch[5].to(device), # mind the switched order - utterance_embedding=batch[7].to(device), - lang_ids=batch[8].to(device), - return_mels=False) - # then we directly update our meta-parameters without - # the need for any task specific parameters - train_losses_total.append(train_loss.item()) - optimizer.zero_grad() - grad_scaler.scale(train_loss).backward() - grad_scaler.unscale_(optimizer) - torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0, error_if_nonfinite=False) - grad_scaler.step(optimizer) - grad_scaler.update() - scheduler.step() - - if step % steps_per_checkpoint == 0: - # ============================== - # Enough steps for some insights - # ============================== - net.eval() - print(f"Total Loss: {round(sum(train_losses_total) / len(train_losses_total), 3)}") - train_losses_total = list() - torch.save({ - "model" : net.state_dict(), - "optimizer" : optimizer.state_dict(), - "scaler" : grad_scaler.state_dict(), - "scheduler" : scheduler.state_dict(), - "step_counter": step, - "default_emb" : default_embeddings["en"] - }, - os.path.join(save_directory, "checkpoint_{}.pt".format(step))) - delete_old_checkpoints(save_directory, keep=5) - for lang in ["en", "de", "el", "es", "fi", "hu", "nl", "fr"]: - plot_progress_spec(net=net, - device=device, - lang=lang, - save_dir=save_directory, - step=step, - utt_embeds=default_embeddings) - net.train() - - -@torch.inference_mode() -def plot_progress_spec(net, device, save_dir, step, lang, utt_embeds): - tf = ArticulatoryCombinedTextFrontend(language=lang) - sentence = "" - default_embed = utt_embeds[lang] - if lang == "en": - sentence = "This is a complex sentence, it even has a pause!" - elif lang == "de": - sentence = "Dies ist ein komplexer Satz, er hat sogar eine Pause!" - elif lang == "el": - sentence = "Αυτή είναι μια σύνθετη πρόταση, έχει ακόμη και παύση!" - elif lang == "es": - sentence = "Esta es una oración compleja, ¡incluso tiene una pausa!" - elif lang == "fi": - sentence = "Tämä on monimutkainen lause, sillä on jopa tauko!" - elif lang == "ru": - sentence = "Это сложное предложение, в нем даже есть пауза!" - elif lang == "hu": - sentence = "Ez egy összetett mondat, még szünet is van benne!" - elif lang == "nl": - sentence = "Dit is een complexe zin, er zit zelfs een pauze in!" - elif lang == "fr": - sentence = "C'est une phrase complexe, elle a même une pause !" 
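Several of the deleted pipelines above also carry a find_faulty_samples helper that scores every datapoint with the current checkpoint and flags losses that come out as NaN or implausibly large, so the offending indices can be reported or removed before training. A compact sketch of the NaN part of that screening; compute_loss is a placeholder standing in for the long keyword-argument call built from the dataset tuple layout shown above:

import torch
from tqdm import tqdm

@torch.inference_mode()
def screen_for_nan_losses(net, dataset, compute_loss, device):
    # Returns indices of datapoints whose loss is NaN so they can be dropped,
    # e.g. via the dataset's remove_samples method used elsewhere in the diff.
    net = net.to(device).eval()
    nan_indices = []
    for index in tqdm(range(len(dataset))):
        loss = compute_loss(net, dataset[index])
        if torch.isnan(loss):
            print(f"NaN loss detected at index {index}")
            nan_indices.append(index)
    return nan_indices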
- phoneme_vector = tf.string_to_tensor(sentence).squeeze(0).to(device) - spec, durations, *_ = net.inference(text=phoneme_vector, - return_duration_pitch_energy=True, - utterance_embedding=default_embed, - lang_id=get_language_id(lang).to(device)) - spec = spec.transpose(0, 1).to("cpu").numpy() - duration_splits, label_positions = cumsum_durations(durations.cpu().numpy()) - if not os.path.exists(os.path.join(save_dir, "spec")): - os.makedirs(os.path.join(save_dir, "spec")) - fig, ax = plt.subplots(nrows=1, ncols=1) - lbd.specshow(spec, - ax=ax, - sr=16000, - cmap='GnBu', - y_axis='mel', - x_axis=None, - hop_length=256) - ax.yaxis.set_visible(False) - ax.set_xticks(duration_splits, minor=True) - ax.xaxis.grid(True, which='minor') - ax.set_xticks(label_positions, minor=False) - ax.set_xticklabels(tf.get_phone_string(sentence)) - ax.set_title(sentence) - plt.savefig(os.path.join(os.path.join(save_dir, "spec"), f"{step}_{lang}.png")) - plt.clf() - plt.close() - - -def collate_and_pad(batch): - # text, text_len, speech, speech_len, durations, energy, pitch, utterance condition, language_id - return (pad_sequence([datapoint[0] for datapoint in batch], batch_first=True), - torch.stack([datapoint[1] for datapoint in batch]).squeeze(1), - pad_sequence([datapoint[2] for datapoint in batch], batch_first=True), - torch.stack([datapoint[3] for datapoint in batch]).squeeze(1), - pad_sequence([datapoint[4] for datapoint in batch], batch_first=True), - pad_sequence([datapoint[5] for datapoint in batch], batch_first=True), - pad_sequence([datapoint[6] for datapoint in batch], batch_first=True), - torch.stack([datapoint[7] for datapoint in batch]).squeeze(), - torch.stack([datapoint[8] for datapoint in batch])) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_rus_finetune.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_rus_finetune.py deleted file mode 100644 index fa8c4e5a..00000000 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint_rus_finetune.py +++ /dev/null @@ -1,399 +0,0 @@ -import random - -import librosa.display as lbd -import matplotlib.pyplot as plt -import torch -import torch.multiprocessing -from torch.cuda.amp import GradScaler -from torch.cuda.amp import autocast -from torch.nn.utils.rnn import pad_sequence -from torch.utils.data.dataloader import DataLoader -from tqdm import tqdm - -from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend -from Preprocessing.ArticulatoryCombinedTextFrontend import get_language_id -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 -from Utility.WarmupScheduler import WarmupScheduler -from Utility.corpus_preparation import prepare_fastspeech_corpus -from Utility.path_to_transcript_dicts import * -from Utility.utils import cumsum_durations -from Utility.utils import delete_old_checkpoints -from Utility.utils import get_most_recent_checkpoint - - -def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, find_faulty_samples_mode=False): - torch.manual_seed(131714) - random.seed(131714) - torch.random.manual_seed(131714) - - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}" - - datasets = list() - - resume_checkpoint = "Models/FastSpeech2_Russian_low_resource/best.pt" - - base_dir = os.path.join("Models", "FastSpeech2_Meta_joint_finetune_russian") - if model_dir is not None: - meta_save_dir = model_dir - else: - meta_save_dir = base_dir - os.makedirs(meta_save_dir, 
exist_ok=True) - - print("Preparing") - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10ru(), - corpus_dir=os.path.join("Corpora", "Russian_low_resource"), - lang="ru")) - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_nancy(), - corpus_dir=os.path.join("Corpora", "Nancy"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_karlsson(), - corpus_dir=os.path.join("Corpora", "Karlsson"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10el(), - corpus_dir=os.path.join("Corpora", "meta_Greek"), - lang="el")) - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_spanish_blizzard_train(), - corpus_dir=os.path.join("Corpora", "spanish_blizzard"), - lang="es")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10fi(), - corpus_dir=os.path.join("Corpora", "meta_Finnish"), - lang="fi")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10hu(), - corpus_dir=os.path.join("Corpora", "meta_Hungarian"), - lang="hu")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10nl(), - corpus_dir=os.path.join("Corpora", "meta_Dutch"), - lang="nl")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10fr(), - corpus_dir=os.path.join("Corpora", "meta_French"), - lang="fr")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_ljspeech(), - corpus_dir=os.path.join("Corpora", "LJSpeech"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_libritts(), - corpus_dir=os.path.join("Corpora", "libri"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_vctk(), - corpus_dir=os.path.join("Corpora", "vctk"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_nvidia_hifitts(), - corpus_dir=os.path.join("Corpora", "hifi"), - lang="en")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_css10es(), - corpus_dir=os.path.join("Corpora", "meta_Spanish"), - lang="es")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_eva(), - corpus_dir=os.path.join("Corpora", "Eva"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_hokus(), - corpus_dir=os.path.join("Corpora", "Hokus"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_bernd(), - corpus_dir=os.path.join("Corpora", "Bernd"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_hui_others(), - corpus_dir=os.path.join("Corpora", "hui_others"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_thorsten(), - corpus_dir=os.path.join("Corpora", "Thorsten"), - lang="de")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_fluxsing(), - corpus_dir=os.path.join("Corpora", "flux_sing"), - lang="en", - ctc_selection=False)) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_portuguese(), - 
corpus_dir=os.path.join("Corpora", "mls_porto"), - lang="pt")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_spanish(), - corpus_dir=os.path.join("Corpora", "mls_spanish"), - lang="es")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_french(), - corpus_dir=os.path.join("Corpora", "mls_french"), - lang="fr")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_italian(), - corpus_dir=os.path.join("Corpora", "mls_italian"), - lang="it")) - - datasets.append(prepare_fastspeech_corpus(transcript_dict=build_path_to_transcript_dict_mls_dutch(), - corpus_dir=os.path.join("Corpora", "mls_dutch"), - lang="nl")) - - if find_faulty_samples_mode: - find_faulty_samples(net=FastSpeech2(lang_embs=100), - datasets=datasets, - device=torch.device("cuda"), - path_to_checkpoint=resume_checkpoint) - else: - train_loop(net=FastSpeech2(lang_embs=100), - device=torch.device("cuda"), - datasets=datasets, - batch_size=4, - save_directory=meta_save_dir, - steps=100000, - steps_per_checkpoint=1000, - lr=0.0001, - path_to_checkpoint=resume_checkpoint, - resume=resume) - - -@torch.no_grad() -def find_faulty_samples(net, - datasets, - device, - path_to_checkpoint): - net = net.to(device) - torch.multiprocessing.set_sharing_strategy('file_system') - check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) - net.load_state_dict(check_dict["model"]) - losses = list() - index_pairs = list() - for dataset_index in range(len(datasets)): - for datapoint_index in tqdm(range(len(datasets[dataset_index]))): - loss = net(text_tensors=datasets[dataset_index][datapoint_index][0].unsqueeze(0).to(device), - text_lengths=datasets[dataset_index][datapoint_index][1].to(device), - gold_speech=datasets[dataset_index][datapoint_index][2].unsqueeze(0).to(device), - speech_lengths=datasets[dataset_index][datapoint_index][3].to(device), - gold_durations=datasets[dataset_index][datapoint_index][4].unsqueeze(0).to(device), - gold_pitch=datasets[dataset_index][datapoint_index][6].unsqueeze(0).to(device), # mind the switched order - gold_energy=datasets[dataset_index][datapoint_index][5].unsqueeze(0).to(device), # mind the switched order - utterance_embedding=datasets[dataset_index][datapoint_index][7].unsqueeze(0).to(device), - lang_ids=datasets[dataset_index][datapoint_index][8].unsqueeze(0).to(device), - return_mels=False).squeeze() - if torch.isnan(loss): - print(f"CAREFUL, NAN DETECTED: {dataset_index}, {datapoint_index}") - losses.append(loss.item()) - index_pairs.append((dataset_index, datapoint_index)) - loss_high_to_low = sorted(losses, reverse=True) - print(loss_high_to_low) - threshold = loss_high_to_low[1000] - for index, loss in enumerate(losses): - if loss > threshold: - print(index_pairs[index]) - print(loss) - - -def train_loop(net, - datasets, - device, - save_directory, - batch_size, - steps, - steps_per_checkpoint, - lr, - path_to_checkpoint, - resume=False): - # ============ - # Preparations - # ============ - net = net.to(device) - torch.multiprocessing.set_sharing_strategy('file_system') - train_loaders = list() - train_iters = list() - for i, dataset in enumerate(datasets): - if i == 0: - train_loaders.append(DataLoader(batch_size=12, - dataset=dataset, - drop_last=True, - num_workers=2, - pin_memory=True, - shuffle=True, - prefetch_factor=5, - collate_fn=collate_and_pad, - persistent_workers=True)) - else: - train_loaders.append(DataLoader(batch_size=2, - 
dataset=dataset, - drop_last=True, - num_workers=2, - pin_memory=True, - shuffle=True, - prefetch_factor=5, - collate_fn=collate_and_pad, - persistent_workers=True)) - train_iters.append(iter(train_loaders[-1])) - default_embeddings = {"ru": None, "en": None, "de": None, "el": None, "es": None, "fi": None, "hu": None, "nl": None, "fr": None} - for index, lang in enumerate(["ru", "en", "de", "el", "es", "fi", "hu", "nl", "fr"]): - default_embedding = None - for datapoint in datasets[index]: - if default_embedding is None: - default_embedding = datapoint[7].squeeze() - else: - default_embedding = default_embedding + datapoint[7].squeeze() - default_embeddings[lang] = (default_embedding / len(datasets[index])).to(device) - optimizer = torch.optim.RAdam(net.parameters(), lr=lr, eps=1.0e-06, weight_decay=0.0) - grad_scaler = GradScaler() - scheduler = WarmupScheduler(optimizer, warmup_steps=4000) - if resume: - previous_checkpoint = get_most_recent_checkpoint(checkpoint_dir=save_directory) - if previous_checkpoint is not None: - path_to_checkpoint = previous_checkpoint - else: - raise RuntimeError(f"No checkpoint found that can be resumed from in {save_directory}") - step_counter = 0 - train_losses_total = list() - if path_to_checkpoint is not None: - check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device) - net.load_state_dict(check_dict["model"]) - if resume: - optimizer.load_state_dict(check_dict["optimizer"]) - step_counter = check_dict["step_counter"] - grad_scaler.load_state_dict(check_dict["scaler"]) - scheduler.load_state_dict(check_dict["scheduler"]) - if step_counter > steps: - print("Desired steps already reached in loaded checkpoint.") - return - - net.train() - # ============================= - # Actual train loop starts here - # ============================= - for step in tqdm(range(step_counter, steps)): - batches = [] - for index in range(len(datasets)): - # we get one batch for each task (i.e. language in this case) - try: - batch = next(train_iters[index]) - batches.append(batch) - except StopIteration: - train_iters[index] = iter(train_loaders[index]) - batch = next(train_iters[index]) - batches.append(batch) - train_loss = 0.0 - for batch in batches: - with autocast(): - # we sum the loss for each task, as we would do for the - # second order regular MAML, but we do it only over one - # step (i.e. 
iterations of inner loop = 1) - train_loss = train_loss + net(text_tensors=batch[0].to(device), - text_lengths=batch[1].to(device), - gold_speech=batch[2].to(device), - speech_lengths=batch[3].to(device), - gold_durations=batch[4].to(device), - gold_pitch=batch[6].to(device), # mind the switched order - gold_energy=batch[5].to(device), # mind the switched order - utterance_embedding=batch[7].to(device), - lang_ids=batch[8].to(device), - return_mels=False) - # then we directly update our meta-parameters without - # the need for any task specific parameters - train_losses_total.append(train_loss.item()) - optimizer.zero_grad() - grad_scaler.scale(train_loss).backward() - grad_scaler.unscale_(optimizer) - torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0, error_if_nonfinite=False) - grad_scaler.step(optimizer) - grad_scaler.update() - scheduler.step() - - if step % steps_per_checkpoint == 0: - # ============================== - # Enough steps for some insights - # ============================== - net.eval() - print(f"Total Loss: {round(sum(train_losses_total) / len(train_losses_total), 3)}") - train_losses_total = list() - torch.save({ - "model": net.state_dict(), - "optimizer": optimizer.state_dict(), - "scaler": grad_scaler.state_dict(), - "scheduler": scheduler.state_dict(), - "step_counter": step, - "default_emb": default_embeddings["en"] - }, - os.path.join(save_directory, "checkpoint_{}.pt".format(step))) - delete_old_checkpoints(save_directory, keep=5) - for lang in ["ru", "en", "de", "el", "es", "fi", "hu", "nl", "fr"]: - plot_progress_spec(net=net, - device=device, - lang=lang, - save_dir=save_directory, - step=step, - utt_embeds=default_embeddings) - net.train() - - -@torch.inference_mode() -def plot_progress_spec(net, device, save_dir, step, lang, utt_embeds): - tf = ArticulatoryCombinedTextFrontend(language=lang) - sentence = "" - default_embed = utt_embeds[lang] - if lang == "en": - sentence = "This is a complex sentence, it even has a pause!" - elif lang == "de": - sentence = "Dies ist ein komplexer Satz, er hat sogar eine Pause!" - elif lang == "el": - sentence = "Αυτή είναι μια σύνθετη πρόταση, έχει ακόμη και παύση!" - elif lang == "es": - sentence = "Esta es una oración compleja, ¡incluso tiene una pausa!" - elif lang == "fi": - sentence = "Tämä on monimutkainen lause, sillä on jopa tauko!" - elif lang == "ru": - sentence = "Это сложное предложение, в нем даже есть пауза!" - elif lang == "hu": - sentence = "Ez egy összetett mondat, még szünet is van benne!" - elif lang == "nl": - sentence = "Dit is een complexe zin, er zit zelfs een pauze in!" - elif lang == "fr": - sentence = "C'est une phrase complexe, elle a même une pause !" 
- phoneme_vector = tf.string_to_tensor(sentence).squeeze(0).to(device) - spec, durations, *_ = net.inference(text=phoneme_vector, - return_duration_pitch_energy=True, - utterance_embedding=default_embed, - lang_id=get_language_id(lang).to(device)) - spec = spec.transpose(0, 1).to("cpu").numpy() - duration_splits, label_positions = cumsum_durations(durations.cpu().numpy()) - if not os.path.exists(os.path.join(save_dir, "spec")): - os.makedirs(os.path.join(save_dir, "spec")) - fig, ax = plt.subplots(nrows=1, ncols=1) - lbd.specshow(spec, - ax=ax, - sr=16000, - cmap='GnBu', - y_axis='mel', - x_axis=None, - hop_length=256) - ax.yaxis.set_visible(False) - ax.set_xticks(duration_splits, minor=True) - ax.xaxis.grid(True, which='minor') - ax.set_xticks(label_positions, minor=False) - ax.set_xticklabels(tf.get_phone_string(sentence)) - ax.set_title(sentence) - plt.savefig(os.path.join(os.path.join(save_dir, "spec"), f"{step}_{lang}.png")) - plt.clf() - plt.close() - - -def collate_and_pad(batch): - # text, text_len, speech, speech_len, durations, energy, pitch, utterance condition, language_id - return (pad_sequence([datapoint[0] for datapoint in batch], batch_first=True), - torch.stack([datapoint[1] for datapoint in batch]).squeeze(1), - pad_sequence([datapoint[2] for datapoint in batch], batch_first=True), - torch.stack([datapoint[3] for datapoint in batch]).squeeze(1), - pad_sequence([datapoint[4] for datapoint in batch], batch_first=True), - pad_sequence([datapoint[5] for datapoint in batch], batch_first=True), - pad_sequence([datapoint[6] for datapoint in batch], batch_first=True), - torch.stack([datapoint[7] for datapoint in batch]).squeeze(), - torch.stack([datapoint[8] for datapoint in batch])) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_RussianLowResource.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_RussianLowResource.py deleted file mode 100644 index 470ae6da..00000000 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_RussianLowResource.py +++ /dev/null @@ -1,73 +0,0 @@ -import random - -import soundfile -import torch - -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2 import FastSpeech2 -from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.fastspeech2_train_loop import train_loop -from Utility.corpus_preparation import prepare_fastspeech_corpus -from Utility.path_to_transcript_dicts import * - - -def run(gpu_id, resume_checkpoint, finetune, model_dir, resume): - if gpu_id == "cpu": - os.environ["CUDA_VISIBLE_DEVICES"] = "" - device = torch.device("cpu") - - else: - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}" - device = torch.device("cuda") - - torch.manual_seed(131714) - random.seed(131714) - torch.random.manual_seed(131714) - - print("Preparing") - - if model_dir is not None: - save_dir = model_dir - else: - save_dir = os.path.join("Models", "FastSpeech2_Russian_low_resource") - os.makedirs(save_dir, exist_ok=True) - - path_to_transcript_dict_ = build_path_to_transcript_dict_css10ru() - path_to_transcript_dict = dict() - - paths = list(path_to_transcript_dict_.keys()) - used_samples = set() - total_len = 0.0 - while total_len < 5.0 * 60.0: - path = random.choice(paths) - x, sr = soundfile.read(path) - duration = len(x) / sr - if 10 > duration > 5 and path not in used_samples: - used_samples.add(path) - total_len += duration - - print(f"Collected {total_len / 60.0} minutes worth of samples.") - - for key in path_to_transcript_dict_: - if key in used_samples: - 
path_to_transcript_dict[key] = path_to_transcript_dict_[key] - - train_set = prepare_fastspeech_corpus(transcript_dict=path_to_transcript_dict, - corpus_dir=os.path.join("Corpora", "Russian_low_resource"), - lang="ru") - - model = FastSpeech2(lang_embs=100) - # because we want to finetune it, we treat it as multilingual and multispeaker model, even though it only has one speaker - - print("Training model") - train_loop(net=model, - train_dataset=train_set, - device=device, - save_directory=save_dir, - steps=10000, - batch_size=32, - lang="ru", - lr=0.0001, - epochs_per_save=20, - path_to_checkpoint="Models/FastSpeech2_Meta_no_slav/best.pt", - fine_tune=True, - resume=resume) diff --git a/Utility/EvaluationScripts/SpeakerVisualization.py b/Utility/EvaluationScripts/SpeakerVisualization.py index f95479eb..9855ae81 100644 --- a/Utility/EvaluationScripts/SpeakerVisualization.py +++ b/Utility/EvaluationScripts/SpeakerVisualization.py @@ -1,13 +1,13 @@ import matplotlib -import numpy import soundfile as sf +import torch from matplotlib import cm from matplotlib import pyplot as plt matplotlib.use("tkAgg") from sklearn.manifold import TSNE from sklearn.decomposition import PCA - +import numpy from tqdm import tqdm from Preprocessing.ProsodicConditionExtractor import ProsodicConditionExtractor @@ -20,15 +20,16 @@ def __init__(self, sr=48000, device="cpu"): Args: sr: The sampling rate of the audios you want to visualize. """ - self.tsne = TSNE(n_jobs=-1) + self.tsne = TSNE(n_jobs=-1, n_iter_without_progress=4000, n_iter=20000) self.pca = PCA(n_components=2) self.pros_cond_ext = ProsodicConditionExtractor(sr=sr, device=device) self.sr = sr - def visualize_speaker_embeddings(self, label_to_filepaths, title_of_plot, save_file_path=None, include_pca=True, legend=True): + def visualize_speaker_embeddings(self, label_to_filepaths, title_of_plot, save_file_path=None, include_pca=True, legend=True, colors=None): label_list = list() embedding_list = list() - for label in tqdm(label_to_filepaths): + ordered_labels = sorted(list(label_to_filepaths.keys())) + for label in tqdm(ordered_labels): for filepath in tqdm(label_to_filepaths[label]): wave, sr = sf.read(filepath) if len(wave) / sr < 1: @@ -48,7 +49,8 @@ def visualize_speaker_embeddings(self, label_to_filepaths, title_of_plot, save_f labels=label_list, title=title_of_plot + " t-SNE" if include_pca else title_of_plot, save_file_path=save_file_path, - legend=legend) + legend=legend, + colors=colors) if include_pca: dimensionality_reduced_embeddings_pca = self.pca.fit_transform(embeddings_as_array) @@ -56,12 +58,14 @@ def visualize_speaker_embeddings(self, label_to_filepaths, title_of_plot, save_f labels=label_list, title=title_of_plot + " PCA", save_file_path=save_file_path, - legend=legend) + legend=legend, + colors=colors) - def _plot_embeddings(self, projected_data, labels, title, save_file_path, legend): - colors = cm.gist_rainbow(numpy.linspace(0, 1, len(set(labels)))) + def _plot_embeddings(self, projected_data, labels, title, save_file_path, legend, colors): + if colors is None: + colors = cm.gist_rainbow(numpy.linspace(0, 1, len(set(labels)))) label_to_color = dict() - for index, label in enumerate(list(set(labels))): + for index, label in enumerate(sorted(list(set(labels)))): label_to_color[label] = colors[index] labels_to_points_x = dict() @@ -74,7 +78,7 @@ def _plot_embeddings(self, projected_data, labels, title, save_file_path, legend labels_to_points_y[label].append(projected_data[index][1]) fig, ax = plt.subplots() - for label in set(labels): 
+ for label in sorted(list(set(labels))): x = numpy.array(labels_to_points_x[label]) y = numpy.array(labels_to_points_y[label]) ax.scatter(x=x, @@ -93,3 +97,29 @@ def _plot_embeddings(self, projected_data, labels, title, save_file_path, legend else: plt.show() plt.close() + + def calculate_spk_sim(self, reference_path, comparisons): + embedding_list = list() + for filepath in tqdm(comparisons): + wave, sr = sf.read(filepath) + if len(wave) / sr < 1: + continue + if self.sr != sr: + print("One of the Audios you included doesn't match the sampling rate of this visualizer object, " + "creating a new condition extractor. Results will be correct, but if there are too many cases " + "of changing samplingrate, this will run very slowly.") + self.pros_cond_ext = ProsodicConditionExtractor(sr=sr) + self.sr = sr + embedding_list.append(self.pros_cond_ext.extract_condition_from_reference_wave(wave).squeeze()) + + wave, sr = sf.read(reference_path) + if self.sr != sr: + self.pros_cond_ext = ProsodicConditionExtractor(sr=sr) + self.sr = sr + reference_embedding = self.pros_cond_ext.extract_condition_from_reference_wave(wave).squeeze() + + sims = list() + for comp_emb in embedding_list: + sims.append(torch.cosine_similarity(reference_embedding, comp_emb, dim=0)) + + return (sum(sims) / len(sims)).item(), numpy.std(sims) diff --git a/run_speaker_visualization.py b/run_speaker_visualization.py index a45f8e0c..730bb545 100644 --- a/run_speaker_visualization.py +++ b/run_speaker_visualization.py @@ -33,16 +33,64 @@ def visualize_libritts(): vs.visualize_speaker_embeddings(label_to_filepaths=ltf, title_of_plot="Embeddings of a Subset of LibriTTS") +def visualize_adept_experiment(): + vs = Visualizer() + ltf = dict() + for exp in os.listdir("audios/adept_plot"): + for sample in os.listdir(f"audios/adept_plot/{exp}"): + + spk_id = sample.split("_")[1].split(".")[0] + if spk_id == "ad00": + spk_label = "Female" + elif spk_id == "ad01": + spk_label = "Male" + else: + spk_label = "Other Female" + + if exp == "human": + exp_label = "Human" + elif exp == "same_voice_diff_style": + exp_label = "Unconditioned" + else: + exp_label = "Cloned" + + plot_label = f"{spk_label} - {exp_label}" + + if exp_label != "Human" and spk_label != "Other Female": + if plot_label not in ltf: + ltf[plot_label] = list() + ltf[plot_label].append(f"audios/adept_plot/{exp}/{sample}") + + vs.visualize_speaker_embeddings(label_to_filepaths=ltf, + title_of_plot="Speakers with and without Cloning", + include_pca=False, + colors=["limegreen", "darkgreen", "dodgerblue", "darkblue"]) + + def visualize_speakers_languages_crossover(): ltf = dict() vs = Visualizer() - for file in os.listdir("speakers_for_plotting"): + for file in os.listdir("audios/speakers_for_plotting"): label = file.split("_")[0].capitalize() + " Speaker" if label not in ltf: ltf[label] = list() - ltf[label].append(f"speakers_for_plotting/{file}") + ltf[label].append(f"audios/speakers_for_plotting/{file}") vs.visualize_speaker_embeddings(label_to_filepaths=ltf, title_of_plot="Speakers Across Languages", include_pca=False) +def calculate_spk_sims_multiling(): + ltf = dict() + vs = Visualizer() + for file in os.listdir("audios/speakers_for_plotting"): + label = file.split("_")[0] + if label not in ltf: + ltf[label] = list() + ltf[label].append(f"audios/speakers_for_plotting/{file}") + for reference in os.listdir("audios/multilanguage_references"): + label = reference.split(".")[0] + print(label) + print(vs.calculate_spk_sim(f"audios/multilanguage_references/{reference}", 
ltf[label])) + + if __name__ == '__main__': - visualize_speakers_languages_crossover() + calculate_spk_sims_multiling() diff --git a/run_training_pipeline.py b/run_training_pipeline.py index a84409cc..e513f096 100644 --- a/run_training_pipeline.py +++ b/run_training_pipeline.py @@ -7,15 +7,7 @@ from TrainingInterfaces.TrainingPipelines.FastSpeech2_Karlsson import run as karlsson from TrainingInterfaces.TrainingPipelines.FastSpeech2_LJ import run as lj from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS import run as libri -from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS_600 import run as libri600 -from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS_asr_out import run as asr_out -from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS_asr_phn import run as asr_phn -from TrainingInterfaces.TrainingPipelines.FastSpeech2_LibriTTS_asr_phn_600 import run as phn600 from TrainingInterfaces.TrainingPipelines.FastSpeech2_MetaCheckpoint import run as meta_fast -from TrainingInterfaces.TrainingPipelines.FastSpeech2_MetaCheckpoint_germ_finetune import run as low_ger -from TrainingInterfaces.TrainingPipelines.FastSpeech2_MetaCheckpoint_no_Germanic import run as no_ger -from TrainingInterfaces.TrainingPipelines.FastSpeech2_MetaCheckpoint_no_Slavic import run as no_slav -from TrainingInterfaces.TrainingPipelines.FastSpeech2_MetaCheckpoint_rus_finetune import run as low_rus from TrainingInterfaces.TrainingPipelines.FastSpeech2_Nancy import run as nancy from TrainingInterfaces.TrainingPipelines.FastSpeech2_RussianSingle import run as single_rus from TrainingInterfaces.TrainingPipelines.HiFiGAN_combined import run as hifigan_combined @@ -29,17 +21,9 @@ "nancy" : nancy, "hifi_combined": hifigan_combined, "aligner" : aligner, - "no_ger" : no_ger, - "no_slav" : no_slav, - "low_rus" : low_rus, - "low_ger" : low_ger, "single_ger" : single_ger, "single_rus" : single_rus, "full_ger" : full_ger, - "asr_out" : asr_out, - "asr_phn" : asr_phn, - "phn600" : phn600, - "libri600" : libri600, "english" : english } From 601aed23fd96b549d1f2402d0b29c82e29202a21 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Mon, 14 Mar 2022 15:13:38 +0100 Subject: [PATCH 25/33] run evaluation on poetry data --- run_evaluation.py | 91 +++++++++++++++++------------------------------ 1 file changed, 32 insertions(+), 59 deletions(-) diff --git a/run_evaluation.py b/run_evaluation.py index 41f34ac7..d219af83 100644 --- a/run_evaluation.py +++ b/run_evaluation.py @@ -3,67 +3,40 @@ from tqdm import tqdm from Utility.EvaluationScripts.audio_vs_audio import ffe -# from Utility.EvaluationScripts.playground_audio_vs_audio_poem import get_pitch_curves_abc -from Utility.EvaluationScripts.audio_vs_audio import gpe from Utility.EvaluationScripts.audio_vs_audio import mcd_with_warping -from Utility.EvaluationScripts.audio_vs_audio import vde -# get_pitch_curves_abc(f"audios/ps2/PoetryStudy/Set5/s5_p1_ref2.wav", f"audios/ps2/PoetryStudy/Set5/s5_p1_ref1.wav", f"audios/ps2/PoetryStudy/Set5/s5_p1_base2_pros_1.wav") +mcd_lyric_cloned = list() +mcd_lyric_uncond = list() +mcd_prose_cloned = list() +mcd_prose_uncond = list() -mcd_same_style = list() -mcd_diff_style = list() +ffe_lyric_cloned = list() +ffe_lyric_uncond = list() +ffe_prose_cloned = list() +ffe_prose_uncond = list() -ffe_same_style = list() -ffe_diff_style = list() - -gpe_same_style = list() -gpe_diff_style = list() - -vde_same_style = list() -vde_diff_style = list() - -for file in tqdm(os.listdir("audios/adept/human")): +for file in 
tqdm(os.listdir("audios/evaluation/human")): if file.endswith(".wav"): - mcd_same_style.append(mcd_with_warping(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}")) - vde_same_style.append(vde(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}")) - gpe_same_style.append(gpe(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}")) - ffe_same_style.append(ffe(f"audios/adept/human/{file}", f"audios/adept/same_voice_same_style/{file}")) - - mcd_diff_style.append(mcd_with_warping(f"audios/adept/human/{file}", f"audios/adept/same_voice_diff_style/{file}")) - vde_diff_style.append(vde(f"audios/adept/human/{file}", f"audios/adept/same_voice_diff_style/{file}")) - gpe_diff_style.append(gpe(f"audios/adept/human/{file}", f"audios/adept/same_voice_diff_style/{file}")) - ffe_diff_style.append(ffe(f"audios/adept/human/{file}", f"audios/adept/same_voice_diff_style/{file}")) - -print(mcd_same_style) -print(vde_same_style) -print(gpe_same_style) -print(ffe_same_style) - -print(mcd_diff_style) -print(vde_diff_style) -print(gpe_diff_style) -print(ffe_diff_style) - -print((1 / len(mcd_same_style)) * sum(mcd_same_style)) -print((1 / len(vde_same_style)) * sum(vde_same_style)) -print((1 / len(gpe_same_style)) * sum(gpe_same_style)) -print((1 / len(ffe_same_style)) * sum(ffe_same_style)) - -print((1 / len(mcd_diff_style)) * sum(mcd_diff_style)) -print((1 / len(vde_diff_style)) * sum(vde_diff_style)) -print((1 / len(gpe_diff_style)) * sum(gpe_diff_style)) -print((1 / len(ffe_diff_style)) * sum(ffe_diff_style)) - -""" -Results on ADEPT - -25.628487893016743 -0.3197801087172897 -0.6701621670474431 -0.6701621670474431 - -5.585193124795971 -0.4236358253409133 -0.7842741112107358 -0.7842741112107358 -""" + mcd_lyric_cloned.append(mcd_with_warping(f"audios/evaluation/human/{file}", + f"audios/evaluation/poetry_cloned/{file.split('.')[0]}_poetic_cloned.wav")) + mcd_lyric_uncond.append(mcd_with_warping(f"audios/evaluation/human/{file}", + f"audios/evaluation/poetry_unconditional/{file.split('.')[0]}_poetic_uncond.wav")) + mcd_prose_cloned.append(mcd_with_warping(f"audios/evaluation/human/{file}", + f"audios/evaluation/prosa_cloned/{file.split('.')[0]}_prosa_cloned.wav")) + mcd_prose_uncond.append(mcd_with_warping(f"audios/evaluation/human/{file}", + f"audios/evaluation/prosa_unconditional/{file.split('.')[0]}_prosa_uncond.wav")) + + ffe_lyric_cloned.append(ffe(f"audios/evaluation/human/{file}", f"audios/evaluation/poetry_cloned/{file.split('.')[0]}_poetic_cloned.wav")) + ffe_lyric_uncond.append(ffe(f"audios/evaluation/human/{file}", f"audios/evaluation/poetry_unconditional/{file.split('.')[0]}_poetic_uncond.wav")) + ffe_prose_cloned.append(ffe(f"audios/evaluation/human/{file}", f"audios/evaluation/prosa_cloned/{file.split('.')[0]}_prosa_cloned.wav")) + ffe_prose_uncond.append(ffe(f"audios/evaluation/human/{file}", f"audios/evaluation/prosa_unconditional/{file.split('.')[0]}_prosa_uncond.wav")) + +print((1 / len(mcd_lyric_cloned)) * sum(mcd_lyric_cloned)) +print((1 / len(mcd_lyric_uncond)) * sum(mcd_lyric_uncond)) +print((1 / len(mcd_prose_cloned)) * sum(mcd_prose_cloned)) +print((1 / len(mcd_prose_uncond)) * sum(mcd_prose_uncond)) + +print((1 / len(ffe_lyric_cloned)) * sum(ffe_lyric_cloned)) +print((1 / len(ffe_lyric_uncond)) * sum(ffe_lyric_uncond)) +print((1 / len(ffe_prose_cloned)) * sum(ffe_prose_cloned)) +print((1 / len(ffe_prose_uncond)) * sum(ffe_prose_uncond)) From b0b40923642c274d7cd4520936613b2e8cffda91 Mon Sep 17 00:00:00 2001 From: Florian Lux 
Date: Mon, 14 Mar 2022 17:43:51 +0100 Subject: [PATCH 26/33] use torch to speed up dtw computation --- Utility/EvaluationScripts/audio_vs_audio.py | 23 +- Utility/EvaluationScripts/soft_dtw.py | 359 ++++++++++++++++++++ run_evaluation.py | 18 +- 3 files changed, 387 insertions(+), 13 deletions(-) create mode 100644 Utility/EvaluationScripts/soft_dtw.py diff --git a/Utility/EvaluationScripts/audio_vs_audio.py b/Utility/EvaluationScripts/audio_vs_audio.py index c9c60a90..e67d0e6a 100644 --- a/Utility/EvaluationScripts/audio_vs_audio.py +++ b/Utility/EvaluationScripts/audio_vs_audio.py @@ -3,6 +3,7 @@ import matplotlib.pyplot as plt import numpy import soundfile as sf +import torch from numpy import inf from numpy import ndim from numpy import zeros @@ -11,6 +12,7 @@ from Preprocessing.AudioPreprocessor import AudioPreprocessor from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.PitchCalculator import Dio +from Utility.EvaluationScripts.soft_dtw import SoftDTW def vde(path_1, path_2): @@ -72,9 +74,9 @@ def ffe(path_1, path_2): def mcd_with_warping(path_1, path_2): """ - calculate mel cepstral distortion between two unaligned sequences by first performing alignment with warping and then calculating the MSE between them. + calculate mel cepstral distortion between two unaligned sequences by performing alignment with warping using MSE as the distance between them. - The two audios have to be spoken by the same speaker for it to make sense. + The two audios have to be spoken by the same speaker for it to make sense. The first one should be the gold reference. DTW takes an insane amount of RAM if you're not careful with sequence lengths """ @@ -86,12 +88,25 @@ def mcd_with_warping(path_1, path_2): return dist / len(spec_1) +@torch.inference_mode() +def soft_mcd(path_1, path_2): + """ + calculate mel cepstral distortion between two unaligned sequences by performing alignment with warping using euclidean distance between them. + + The two audios have to be spoken by the same speaker for it to make sense. The first one should be the gold reference. + """ + wave_1, sr_1 = sf.read(path_1) + wave_2, sr_2 = sf.read(path_2) + spec_1 = logmelfilterbank(audio=wave_1, sampling_rate=sr_1) + spec_2 = logmelfilterbank(audio=wave_2, sampling_rate=sr_2) + dist = SoftDTW(use_cuda=False)(torch.tensor(spec_1).unsqueeze(0), torch.tensor(spec_2).unsqueeze(0)) + return dist / len(spec_1) + + def dtw(x, y, dist, warp=1): """ https://github.com/pierre-rouanet/dtw/blob/master/dtw/dtw.py """ - assert len(x) - assert len(y) if ndim(x) == 1: x = x.reshape(-1, 1) if ndim(y) == 1: diff --git a/Utility/EvaluationScripts/soft_dtw.py b/Utility/EvaluationScripts/soft_dtw.py new file mode 100644 index 00000000..afbb573d --- /dev/null +++ b/Utility/EvaluationScripts/soft_dtw.py @@ -0,0 +1,359 @@ +# MIT License +# +# Copyright (c) 2020 Mehran Maghoumi +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ---------------------------------------------------------------------------------------------------------------------- + +import math + +import numpy as np +import torch +import torch.cuda +from numba import cuda +from numba import jit +from torch.autograd import Function + + +# ---------------------------------------------------------------------------------------------------------------------- +@cuda.jit +def compute_softdtw_cuda(D, gamma, bandwidth, max_i, max_j, n_passes, R): + """ + :param seq_len: The length of the sequence (both inputs are assumed to be of the same size) + :param n_passes: 2 * seq_len - 1 (The number of anti-diagonals) + """ + # Each block processes one pair of examples + b = cuda.blockIdx.x + # We have as many threads as seq_len, because the most number of threads we need + # is equal to the number of elements on the largest anti-diagonal + tid = cuda.threadIdx.x + + # Compute I, J, the indices from [0, seq_len) + + # The row index is always the same as tid + I = tid + + inv_gamma = 1.0 / gamma + + # Go over each anti-diagonal. Only process threads that fall on the current on the anti-diagonal + for p in range(n_passes): + + # The index is actually 'p - tid' but need to force it in-bounds + J = max(0, min(p - tid, max_j - 1)) + + # For simplicity, we define i, j which start from 1 (offset from I, J) + i = I + 1 + j = J + 1 + + # Only compute if element[i, j] is on the current anti-diagonal, and also is within bounds + if I + J == p and (I < max_i and J < max_j): + # Don't compute if outside bandwidth + if not (abs(i - j) > bandwidth > 0): + r0 = -R[b, i - 1, j - 1] * inv_gamma + r1 = -R[b, i - 1, j] * inv_gamma + r2 = -R[b, i, j - 1] * inv_gamma + rmax = max(max(r0, r1), r2) + rsum = math.exp(r0 - rmax) + math.exp(r1 - rmax) + math.exp(r2 - rmax) + softmin = -gamma * (math.log(rsum) + rmax) + R[b, i, j] = D[b, i - 1, j - 1] + softmin + + # Wait for other threads in this block + cuda.syncthreads() + + +# ---------------------------------------------------------------------------------------------------------------------- +@cuda.jit +def compute_softdtw_backward_cuda(D, R, inv_gamma, bandwidth, max_i, max_j, n_passes, E): + k = cuda.blockIdx.x + tid = cuda.threadIdx.x + + # Indexing logic is the same as above, however, the anti-diagonal needs to + # progress backwards + I = tid + + for p in range(n_passes): + # Reverse the order to make the loop go backward + rev_p = n_passes - p - 1 + + # convert tid to I, J, then i, j + J = max(0, min(rev_p - tid, max_j - 1)) + + i = I + 1 + j = J + 1 + + # Only compute if element[i, j] is on the current anti-diagonal, and also is within bounds + if I + J == rev_p and (I < max_i and J < max_j): + + if math.isinf(R[k, i, j]): + R[k, i, j] = -math.inf + + # Don't compute if outside bandwidth + if not (abs(i - j) > bandwidth > 0): + a = math.exp((R[k, i + 1, j] - R[k, i, j] - D[k, i + 1, j]) * inv_gamma) + b = math.exp((R[k, i, j + 1] - R[k, i, j] - D[k, i, j + 1]) * inv_gamma) + c = math.exp((R[k, i + 1, j + 1] - R[k, i, j] - D[k, i + 1, j + 1]) * 
inv_gamma) + E[k, i, j] = E[k, i + 1, j] * a + E[k, i, j + 1] * b + E[k, i + 1, j + 1] * c + + # Wait for other threads in this block + cuda.syncthreads() + + +# ---------------------------------------------------------------------------------------------------------------------- +class _SoftDTWCUDA(Function): + """ + CUDA implementation is inspired by the diagonal one proposed in https://ieeexplore.ieee.org/document/8400444: + "Developing a pattern discovery method in time series data and its GPU acceleration" + """ + + @staticmethod + def forward(ctx, D, gamma, bandwidth): + dev = D.device + dtype = D.dtype + gamma = torch.cuda.FloatTensor([gamma]) + bandwidth = torch.cuda.FloatTensor([bandwidth]) + + B = D.shape[0] + N = D.shape[1] + M = D.shape[2] + threads_per_block = max(N, M) + n_passes = 2 * threads_per_block - 1 + + # Prepare the output array + R = torch.ones((B, N + 2, M + 2), device=dev, dtype=dtype) * math.inf + R[:, 0, 0] = 0 + + # Run the CUDA kernel. + # Set CUDA's grid size to be equal to the batch size (every CUDA block processes one sample pair) + # Set the CUDA block size to be equal to the length of the longer sequence (equal to the size of the largest diagonal) + compute_softdtw_cuda[B, threads_per_block](cuda.as_cuda_array(D.detach()), + gamma.item(), bandwidth.item(), N, M, n_passes, + cuda.as_cuda_array(R)) + ctx.save_for_backward(D, R.clone(), gamma, bandwidth) + return R[:, -2, -2] + + @staticmethod + def backward(ctx, grad_output): + dev = grad_output.device + dtype = grad_output.dtype + D, R, gamma, bandwidth = ctx.saved_tensors + + B = D.shape[0] + N = D.shape[1] + M = D.shape[2] + threads_per_block = max(N, M) + n_passes = 2 * threads_per_block - 1 + + D_ = torch.zeros((B, N + 2, M + 2), dtype=dtype, device=dev) + D_[:, 1:N + 1, 1:M + 1] = D + + R[:, :, -1] = -math.inf + R[:, -1, :] = -math.inf + R[:, -1, -1] = R[:, -2, -2] + + E = torch.zeros((B, N + 2, M + 2), dtype=dtype, device=dev) + E[:, -1, -1] = 1 + + # Grid and block sizes are set same as done above for the forward() call + compute_softdtw_backward_cuda[B, threads_per_block](cuda.as_cuda_array(D_), + cuda.as_cuda_array(R), + 1.0 / gamma.item(), bandwidth.item(), N, M, n_passes, + cuda.as_cuda_array(E)) + E = E[:, 1:N + 1, 1:M + 1] + return grad_output.view(-1, 1, 1).expand_as(E) * E, None, None + + +# ---------------------------------------------------------------------------------------------------------------------- +# +# The following is the CPU implementation based on https://github.com/Sleepwalking/pytorch-softdtw +# Credit goes to Kanru Hua. +# I've added support for batching and pruning. 
+# +# ---------------------------------------------------------------------------------------------------------------------- +@jit(nopython=True) +def compute_softdtw(D, gamma, bandwidth): + B = D.shape[0] + N = D.shape[1] + M = D.shape[2] + R = np.ones((B, N + 2, M + 2)) * np.inf + R[:, 0, 0] = 0 + for b in range(B): + for j in range(1, M + 1): + for i in range(1, N + 1): + + # Check the pruning condition + if 0 < bandwidth < np.abs(i - j): + continue + + r0 = -R[b, i - 1, j - 1] / gamma + r1 = -R[b, i - 1, j] / gamma + r2 = -R[b, i, j - 1] / gamma + rmax = max(max(r0, r1), r2) + rsum = np.exp(r0 - rmax) + np.exp(r1 - rmax) + np.exp(r2 - rmax) + softmin = - gamma * (np.log(rsum) + rmax) + R[b, i, j] = D[b, i - 1, j - 1] + softmin + return R + + +# ---------------------------------------------------------------------------------------------------------------------- +@jit(nopython=True) +def compute_softdtw_backward(D_, R, gamma, bandwidth): + B = D_.shape[0] + N = D_.shape[1] + M = D_.shape[2] + D = np.zeros((B, N + 2, M + 2)) + E = np.zeros((B, N + 2, M + 2)) + D[:, 1:N + 1, 1:M + 1] = D_ + E[:, -1, -1] = 1 + R[:, :, -1] = -np.inf + R[:, -1, :] = -np.inf + R[:, -1, -1] = R[:, -2, -2] + for k in range(B): + for j in range(M, 0, -1): + for i in range(N, 0, -1): + + if np.isinf(R[k, i, j]): + R[k, i, j] = -np.inf + + # Check the pruning condition + if 0 < bandwidth < np.abs(i - j): + continue + + a0 = (R[k, i + 1, j] - R[k, i, j] - D[k, i + 1, j]) / gamma + b0 = (R[k, i, j + 1] - R[k, i, j] - D[k, i, j + 1]) / gamma + c0 = (R[k, i + 1, j + 1] - R[k, i, j] - D[k, i + 1, j + 1]) / gamma + a = np.exp(a0) + b = np.exp(b0) + c = np.exp(c0) + E[k, i, j] = E[k, i + 1, j] * a + E[k, i, j + 1] * b + E[k, i + 1, j + 1] * c + return E[:, 1:N + 1, 1:M + 1] + + +# ---------------------------------------------------------------------------------------------------------------------- +class _SoftDTW(Function): + """ + CPU implementation based on https://github.com/Sleepwalking/pytorch-softdtw + """ + + @staticmethod + def forward(ctx, D, gamma, bandwidth): + dev = D.device + dtype = D.dtype + gamma = torch.Tensor([gamma]).to(dev).type(dtype) # dtype fixed + bandwidth = torch.Tensor([bandwidth]).to(dev).type(dtype) + D_ = D.detach().cpu().numpy() + g_ = gamma.item() + b_ = bandwidth.item() + R = torch.Tensor(compute_softdtw(D_, g_, b_)).to(dev).type(dtype) + ctx.save_for_backward(D, R, gamma, bandwidth) + return R[:, -2, -2] + + @staticmethod + def backward(ctx, grad_output): + dev = grad_output.device + dtype = grad_output.dtype + D, R, gamma, bandwidth = ctx.saved_tensors + D_ = D.detach().cpu().numpy() + R_ = R.detach().cpu().numpy() + g_ = gamma.item() + b_ = bandwidth.item() + E = torch.Tensor(compute_softdtw_backward(D_, R_, g_, b_)).to(dev).type(dtype) + return grad_output.view(-1, 1, 1).expand_as(E) * E, None, None + + +# ---------------------------------------------------------------------------------------------------------------------- +class SoftDTW(torch.nn.Module): + """ + The soft DTW implementation that optionally supports CUDA + """ + + def __init__(self, use_cuda, gamma=1.0, normalize=False, bandwidth=None, dist_func=None): + """ + Initializes a new instance using the supplied parameters + :param use_cuda: Flag indicating whether the CUDA implementation should be used + :param gamma: sDTW's gamma parameter + :param normalize: Flag indicating whether to perform normalization + (as discussed in https://github.com/mblondel/soft-dtw/issues/10#issuecomment-383564790) + :param bandwidth: 
Sakoe-Chiba bandwidth for pruning. Passing 'None' will disable pruning. + :param dist_func: Optional point-wise distance function to use. If 'None', then a default Euclidean distance function will be used. + """ + super(SoftDTW, self).__init__() + self.normalize = normalize + self.gamma = gamma + self.bandwidth = 0 if bandwidth is None else float(bandwidth) + self.use_cuda = use_cuda + + # Set the distance function + if dist_func is not None: + self.dist_func = dist_func + else: + self.dist_func = SoftDTW._euclidean_dist_func + + def _get_func_dtw(self, x, y): + """ + Checks the inputs and selects the proper implementation to use. + """ + bx, lx, dx = x.shape + by, ly, dy = y.shape + # Make sure the dimensions match + assert bx == by # Equal batch sizes + assert dx == dy # Equal feature dimensions + + use_cuda = self.use_cuda + + if use_cuda and (lx > 1024 or ly > 1024): # We should be able to spawn enough threads in CUDA + print("SoftDTW: Cannot use CUDA because the sequence length > 1024 (the maximum block size supported by CUDA)") + use_cuda = False + + # Finally, return the correct function + return _SoftDTWCUDA.apply if use_cuda else _SoftDTW.apply + + @staticmethod + def _euclidean_dist_func(x, y): + """ + Calculates the Euclidean distance between each element in x and y per timestep + """ + n = x.size(1) + m = y.size(1) + d = x.size(2) + x = x.unsqueeze(2).expand(-1, n, m, d) + y = y.unsqueeze(1).expand(-1, n, m, d) + return torch.pow(x - y, 2).sum(3) + + def forward(self, X, Y): + """ + Compute the soft-DTW value between X and Y + :param X: One batch of examples, batch_size x seq_len x dims + :param Y: The other batch of examples, batch_size x seq_len x dims + :return: The computed results + """ + + # Check the inputs and get the correct implementation + func_dtw = self._get_func_dtw(X, Y) + + if self.normalize: + # Stack everything up and run + x = torch.cat([X, X, Y]) + y = torch.cat([Y, X, Y]) + D = self.dist_func(x, y) + out = func_dtw(D, self.gamma, self.bandwidth) + out_xy, out_xx, out_yy = torch.split(out, X.shape[0]) + return out_xy - 1 / 2 * (out_xx + out_yy) + else: + D_xy = self.dist_func(X, Y) + return func_dtw(D_xy, self.gamma, self.bandwidth) diff --git a/run_evaluation.py b/run_evaluation.py index d219af83..e0b65a5d 100644 --- a/run_evaluation.py +++ b/run_evaluation.py @@ -3,7 +3,7 @@ from tqdm import tqdm from Utility.EvaluationScripts.audio_vs_audio import ffe -from Utility.EvaluationScripts.audio_vs_audio import mcd_with_warping +from Utility.EvaluationScripts.audio_vs_audio import soft_mcd mcd_lyric_cloned = list() mcd_lyric_uncond = list() @@ -17,14 +17,14 @@ for file in tqdm(os.listdir("audios/evaluation/human")): if file.endswith(".wav"): - mcd_lyric_cloned.append(mcd_with_warping(f"audios/evaluation/human/{file}", - f"audios/evaluation/poetry_cloned/{file.split('.')[0]}_poetic_cloned.wav")) - mcd_lyric_uncond.append(mcd_with_warping(f"audios/evaluation/human/{file}", - f"audios/evaluation/poetry_unconditional/{file.split('.')[0]}_poetic_uncond.wav")) - mcd_prose_cloned.append(mcd_with_warping(f"audios/evaluation/human/{file}", - f"audios/evaluation/prosa_cloned/{file.split('.')[0]}_prosa_cloned.wav")) - mcd_prose_uncond.append(mcd_with_warping(f"audios/evaluation/human/{file}", - f"audios/evaluation/prosa_unconditional/{file.split('.')[0]}_prosa_uncond.wav")) + mcd_lyric_cloned.append(soft_mcd(f"audios/evaluation/human/{file}", + f"audios/evaluation/poetry_cloned/{file.split('.')[0]}_poetic_cloned.wav")) + 
mcd_lyric_uncond.append(soft_mcd(f"audios/evaluation/human/{file}", + f"audios/evaluation/poetry_unconditional/{file.split('.')[0]}_poetic_uncond.wav")) + mcd_prose_cloned.append(soft_mcd(f"audios/evaluation/human/{file}", + f"audios/evaluation/prosa_cloned/{file.split('.')[0]}_prosa_cloned.wav")) + mcd_prose_uncond.append(soft_mcd(f"audios/evaluation/human/{file}", + f"audios/evaluation/prosa_unconditional/{file.split('.')[0]}_prosa_uncond.wav")) ffe_lyric_cloned.append(ffe(f"audios/evaluation/human/{file}", f"audios/evaluation/poetry_cloned/{file.split('.')[0]}_poetic_cloned.wav")) ffe_lyric_uncond.append(ffe(f"audios/evaluation/human/{file}", f"audios/evaluation/poetry_unconditional/{file.split('.')[0]}_poetic_uncond.wav")) From 5b51727dc7c0c604ccbfe6fa89a3b4b810a1e6d4 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Mon, 14 Mar 2022 22:39:37 +0100 Subject: [PATCH 27/33] fix major impact of silences on DTW results --- Utility/EvaluationScripts/audio_vs_audio.py | 21 ++++++++++---- run_evaluation.py | 32 ++++++++++++--------- 2 files changed, 33 insertions(+), 20 deletions(-) diff --git a/Utility/EvaluationScripts/audio_vs_audio.py b/Utility/EvaluationScripts/audio_vs_audio.py index e67d0e6a..848735ca 100644 --- a/Utility/EvaluationScripts/audio_vs_audio.py +++ b/Utility/EvaluationScripts/audio_vs_audio.py @@ -85,7 +85,9 @@ def mcd_with_warping(path_1, path_2): spec_1 = logmelfilterbank(audio=wave_1, sampling_rate=sr_1) spec_2 = logmelfilterbank(audio=wave_2, sampling_rate=sr_2) dist, _, _ = dtw(spec_1, spec_2, mean_squared_error) - return dist / len(spec_1) + d = dist / len(spec_1) + print(d) + return d @torch.inference_mode() @@ -97,10 +99,17 @@ def soft_mcd(path_1, path_2): """ wave_1, sr_1 = sf.read(path_1) wave_2, sr_2 = sf.read(path_2) - spec_1 = logmelfilterbank(audio=wave_1, sampling_rate=sr_1) - spec_2 = logmelfilterbank(audio=wave_2, sampling_rate=sr_2) - dist = SoftDTW(use_cuda=False)(torch.tensor(spec_1).unsqueeze(0), torch.tensor(spec_2).unsqueeze(0)) - return dist / len(spec_1) + + ap1 = AudioPreprocessor(cut_silence=True, input_sr=sr_1, output_sr=16000) + ap2 = AudioPreprocessor(cut_silence=True, input_sr=sr_2, output_sr=16000) + + spec_1 = logmelfilterbank(audio=ap1.audio_to_wave_tensor(wave_1, normalize=True).squeeze().numpy(), sampling_rate=16000) + spec_2 = logmelfilterbank(audio=ap2.audio_to_wave_tensor(wave_2, normalize=True).squeeze().numpy(), sampling_rate=16000) + + dist = SoftDTW(use_cuda=False, gamma=0.0001)(torch.tensor(spec_1).unsqueeze(0), torch.tensor(spec_2).unsqueeze(0)) / len(spec_2) + print(dist) + + return dist def dtw(x, y, dist, warp=1): @@ -136,7 +145,7 @@ def logmelfilterbank(audio, sampling_rate, fmin=40, fmax=8000, eps=1e-10): # get mel basis fmin = 0 if fmin is None else fmin fmax = sampling_rate / 2 if fmax is None else fmax - mel_basis = librosa.filters.mel(sampling_rate, 1024, 80, fmin, fmax) + mel_basis = librosa.filters.mel(sampling_rate, 1024, 10, fmin, fmax) # apply log and return return numpy.log10(numpy.maximum(eps, numpy.dot(spc, mel_basis.T))) diff --git a/run_evaluation.py b/run_evaluation.py index e0b65a5d..27ae3c3e 100644 --- a/run_evaluation.py +++ b/run_evaluation.py @@ -15,21 +15,25 @@ ffe_prose_cloned = list() ffe_prose_uncond = list() -for file in tqdm(os.listdir("audios/evaluation/human")): +for file in tqdm(os.listdir("audios/evaluation_no_noisereduce/human")): if file.endswith(".wav"): - mcd_lyric_cloned.append(soft_mcd(f"audios/evaluation/human/{file}", - 
f"audios/evaluation/poetry_cloned/{file.split('.')[0]}_poetic_cloned.wav")) - mcd_lyric_uncond.append(soft_mcd(f"audios/evaluation/human/{file}", - f"audios/evaluation/poetry_unconditional/{file.split('.')[0]}_poetic_uncond.wav")) - mcd_prose_cloned.append(soft_mcd(f"audios/evaluation/human/{file}", - f"audios/evaluation/prosa_cloned/{file.split('.')[0]}_prosa_cloned.wav")) - mcd_prose_uncond.append(soft_mcd(f"audios/evaluation/human/{file}", - f"audios/evaluation/prosa_unconditional/{file.split('.')[0]}_prosa_uncond.wav")) - - ffe_lyric_cloned.append(ffe(f"audios/evaluation/human/{file}", f"audios/evaluation/poetry_cloned/{file.split('.')[0]}_poetic_cloned.wav")) - ffe_lyric_uncond.append(ffe(f"audios/evaluation/human/{file}", f"audios/evaluation/poetry_unconditional/{file.split('.')[0]}_poetic_uncond.wav")) - ffe_prose_cloned.append(ffe(f"audios/evaluation/human/{file}", f"audios/evaluation/prosa_cloned/{file.split('.')[0]}_prosa_cloned.wav")) - ffe_prose_uncond.append(ffe(f"audios/evaluation/human/{file}", f"audios/evaluation/prosa_unconditional/{file.split('.')[0]}_prosa_uncond.wav")) + mcd_lyric_cloned.append(soft_mcd(f"audios/evaluation_no_noisereduce/human/{file}", + f"audios/evaluation_no_noisereduce/poetry_cloned/{file.split('.')[0]}_poetic_cloned.wav")) + mcd_lyric_uncond.append(soft_mcd(f"audios/evaluation_no_noisereduce/human/{file}", + f"audios/evaluation_no_noisereduce/poetry_unconditional/{file.split('.')[0]}_poetic_uncond.wav")) + mcd_prose_cloned.append(soft_mcd(f"audios/evaluation_no_noisereduce/human/{file}", + f"audios/evaluation_no_noisereduce/prosa_cloned/{file.split('.')[0]}_prosa_cloned.wav")) + mcd_prose_uncond.append(soft_mcd(f"audios/evaluation_no_noisereduce/human/{file}", + f"audios/evaluation_no_noisereduce/prosa_unconditional/{file.split('.')[0]}_prosa_uncond.wav")) + + ffe_lyric_cloned.append( + ffe(f"audios/evaluation_no_noisereduce/human/{file}", f"audios/evaluation_no_noisereduce/poetry_cloned/{file.split('.')[0]}_poetic_cloned.wav")) + ffe_lyric_uncond.append(ffe(f"audios/evaluation_no_noisereduce/human/{file}", + f"audios/evaluation_no_noisereduce/poetry_unconditional/{file.split('.')[0]}_poetic_uncond.wav")) + ffe_prose_cloned.append( + ffe(f"audios/evaluation_no_noisereduce/human/{file}", f"audios/evaluation_no_noisereduce/prosa_cloned/{file.split('.')[0]}_prosa_cloned.wav")) + ffe_prose_uncond.append(ffe(f"audios/evaluation_no_noisereduce/human/{file}", + f"audios/evaluation_no_noisereduce/prosa_unconditional/{file.split('.')[0]}_prosa_uncond.wav")) print((1 / len(mcd_lyric_cloned)) * sum(mcd_lyric_cloned)) print((1 / len(mcd_lyric_uncond)) * sum(mcd_lyric_uncond)) From d14628925052bae009c9f1f3399267dac6356d0b Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Mon, 14 Mar 2022 22:40:11 +0100 Subject: [PATCH 28/33] fix debug statement still intact --- Utility/EvaluationScripts/audio_vs_audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Utility/EvaluationScripts/audio_vs_audio.py b/Utility/EvaluationScripts/audio_vs_audio.py index 848735ca..9618a8db 100644 --- a/Utility/EvaluationScripts/audio_vs_audio.py +++ b/Utility/EvaluationScripts/audio_vs_audio.py @@ -145,7 +145,7 @@ def logmelfilterbank(audio, sampling_rate, fmin=40, fmax=8000, eps=1e-10): # get mel basis fmin = 0 if fmin is None else fmin fmax = sampling_rate / 2 if fmax is None else fmax - mel_basis = librosa.filters.mel(sampling_rate, 1024, 10, fmin, fmax) + mel_basis = librosa.filters.mel(sampling_rate, 1024, 80, fmin, fmax) # apply log and return return 
numpy.log10(numpy.maximum(eps, numpy.dot(spc, mel_basis.T))) From 4ec7f12b11221f4844c1b4645a8e98d0383d792d Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Tue, 15 Mar 2022 10:57:51 +0100 Subject: [PATCH 29/33] fix slicing in custom dataset --- .../FastSpeech2/meta_train_loop.py | 8 +------- .../TrainingPipelines/FastSpeech2_MetaCheckpoint.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py b/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py index bd5925c6..40707ba8 100644 --- a/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py +++ b/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py @@ -50,13 +50,7 @@ def train_loop(net, train_iters.append(iter(train_loaders[-1])) default_embeddings = {"en": None, "de": None, "el": None, "es": None, "fi": None, "ru": None, "hu": None, "nl": None, "fr": None} for index, lang in enumerate(["en", "de", "el", "es", "fi", "ru", "hu", "nl", "fr"]): - default_embedding = None - for datapoint in datasets[index][:20]: # default embedding for plotting is the average embedding of the first 20 datapoints for each language - if default_embedding is None: - default_embedding = datapoint[7].squeeze() - else: - default_embedding = default_embedding + datapoint[7].squeeze() - default_embeddings[lang] = (default_embedding / 20).to(device) + default_embeddings[lang] = datasets[index][0][7].squeeze() optimizer = torch.optim.RAdam(net.parameters(), lr=lr, eps=1.0e-06, weight_decay=0.0) grad_scaler = GradScaler() scheduler = WarmupScheduler(optimizer, warmup_steps=warmup_steps) diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py index d7c3139b..f29ca28a 100644 --- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py +++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py @@ -162,7 +162,18 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, remove_faulty_sa if remove_faulty_samples: find_and_remove_faulty_samples(net=FastSpeech2(lang_embs=100), - datasets=datasets, + datasets=english_datasets + + german_datasets + + greek_datasets + + spanish_datasets + + finnish_datasets + + russian_datasets + + hungarian_datasets + + dutch_datasets + + french_datasets + + portuguese_datasets + + polish_datasets + + italian_datasets, device=torch.device("cuda"), path_to_checkpoint=resume_checkpoint) From 23df1e217bf36bc59b6da2e1eb6a590f2b01473f Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Tue, 15 Mar 2022 17:11:34 +0100 Subject: [PATCH 30/33] fix device of default embedding --- .../Text_to_Spectrogram/FastSpeech2/meta_train_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py b/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py index 40707ba8..7256a647 100644 --- a/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py +++ b/TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py @@ -50,7 +50,7 @@ def train_loop(net, train_iters.append(iter(train_loaders[-1])) default_embeddings = {"en": None, "de": None, "el": None, "es": None, "fi": None, "ru": None, "hu": None, "nl": None, "fr": None} for index, lang in enumerate(["en", "de", "el", "es", "fi", "ru", "hu", "nl", "fr"]): - default_embeddings[lang] = datasets[index][0][7].squeeze() + 
default_embeddings[lang] = datasets[index][0][7].squeeze().to(device)
 optimizer = torch.optim.RAdam(net.parameters(), lr=lr, eps=1.0e-06, weight_decay=0.0)
 grad_scaler = GradScaler()
 scheduler = WarmupScheduler(optimizer, warmup_steps=warmup_steps)

From 1a818780157a2215ec00b9ae004370cdbdf22c04 Mon Sep 17 00:00:00 2001
From: Florian Lux
Date: Sat, 19 Mar 2022 00:32:46 +0100
Subject: [PATCH 31/33] use a different language ID for English, since 0 might be overloaded somehow

---
 InferenceInterfaces/InferenceFastSpeech2.py | 2 +-
 Preprocessing/ArticulatoryCombinedTextFrontend.py | 6 +++---
 .../TrainingPipelines/FastSpeech2_MetaCheckpoint.py | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/InferenceInterfaces/InferenceFastSpeech2.py b/InferenceInterfaces/InferenceFastSpeech2.py
index 1b4e00c2..46ebc026 100644
--- a/InferenceInterfaces/InferenceFastSpeech2.py
+++ b/InferenceInterfaces/InferenceFastSpeech2.py
@@ -17,7 +17,7 @@ class InferenceFastSpeech2(torch.nn.Module):
-    def __init__(self, device="cpu", model_name="Meta", language="en", noise_reduce=True):
+    def __init__(self, device="cpu", model_name="Meta", language="en", noise_reduce=False):
         super().__init__()
         self.device = device
         self.text2phone = ArticulatoryCombinedTextFrontend(language=language, add_silence_to_end=True)

diff --git a/Preprocessing/ArticulatoryCombinedTextFrontend.py b/Preprocessing/ArticulatoryCombinedTextFrontend.py
index 5d5cd0e7..bcd6fd78 100644
--- a/Preprocessing/ArticulatoryCombinedTextFrontend.py
+++ b/Preprocessing/ArticulatoryCombinedTextFrontend.py
@@ -288,9 +288,7 @@ def english_text_expansion(text):
 def get_language_id(language):
-    if language == "en":
-        return torch.LongTensor([0])
-    elif language == "de":
+    if language == "de":
         return torch.LongTensor([1])
     elif language == "el":
         return torch.LongTensor([2])
@@ -312,6 +310,8 @@ def get_language_id(language):
         return torch.LongTensor([10])
     elif language == "it":
         return torch.LongTensor([11])
+    elif language == "en":
+        return torch.LongTensor([12])
 if __name__ == '__main__':

diff --git a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py
index f29ca28a..b59353ef 100644
--- a/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py
+++ b/TrainingInterfaces/TrainingPipelines/FastSpeech2_MetaCheckpoint.py
@@ -182,7 +182,7 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, remove_faulty_sa
               datasets=datasets,
               batch_size=6,
               save_directory=meta_save_dir,
-              steps=200000,
+              steps=300000,
               steps_per_checkpoint=1000,
               lr=0.001,
               path_to_checkpoint=resume_checkpoint,

From a426c6f3f6555ecd5864afede9ef13d04da4c6ab Mon Sep 17 00:00:00 2001
From: Flux9665
Date: Sat, 19 Mar 2022 19:25:19 +0100
Subject: [PATCH 32/33] fix incredibly stupid mistake

---
 InferenceInterfaces/InferenceFastSpeech2.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/InferenceInterfaces/InferenceFastSpeech2.py b/InferenceInterfaces/InferenceFastSpeech2.py
index 46ebc026..649612d1 100644
--- a/InferenceInterfaces/InferenceFastSpeech2.py
+++ b/InferenceInterfaces/InferenceFastSpeech2.py
@@ -66,7 +66,8 @@ def forward(self, text, view=False, durations=None, pitch=None, energy=None, inp
                     utterance_embedding=self.default_utterance_embedding,
                     durations=durations,
                     pitch=pitch,
-                    energy=energy)
+                    energy=energy,
+                    lang_id=self.lang_id)
         mel = mel.transpose(0, 1)
         wave = self.mel2wav(mel)
         if view:

From be7bb2a80c714e564e4e61ad1246449bb4f8e8a2 Mon Sep 17 00:00:00 2001
From:
Flux9665 Date: Sat, 19 Mar 2022 19:46:40 +0100 Subject: [PATCH 33/33] fix not all models having a lang id --- InferenceInterfaces/InferenceFastSpeech2.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/InferenceInterfaces/InferenceFastSpeech2.py b/InferenceInterfaces/InferenceFastSpeech2.py index 649612d1..bfe74849 100644 --- a/InferenceInterfaces/InferenceFastSpeech2.py +++ b/InferenceInterfaces/InferenceFastSpeech2.py @@ -22,10 +22,12 @@ def __init__(self, device="cpu", model_name="Meta", language="en", noise_reduce= self.device = device self.text2phone = ArticulatoryCombinedTextFrontend(language=language, add_silence_to_end=True) checkpoint = torch.load(os.path.join("Models", f"FastSpeech2_{model_name}", "best.pt"), map_location='cpu') + self.use_lang_id = True try: self.phone2mel = FastSpeech2(weights=checkpoint["model"]).to(torch.device(device)) # multi speaker multi language except RuntimeError: try: + self.use_lang_id = False self.phone2mel = FastSpeech2(weights=checkpoint["model"], lang_embs=None).to(torch.device(device)) # multi speaker single language except RuntimeError: self.phone2mel = FastSpeech2(weights=checkpoint["model"], lang_embs=None, utt_embed_dim=None).to(torch.device(device)) # single speaker @@ -33,7 +35,10 @@ def __init__(self, device="cpu", model_name="Meta", language="en", noise_reduce= self.default_utterance_embedding = checkpoint["default_emb"].to(self.device) self.phone2mel.eval() self.mel2wav.eval() - self.lang_id = get_language_id(language) + if self.use_lang_id: + self.lang_id = get_language_id(language) + else: + self.lang_id = None self.to(torch.device(device)) self.noise_reduce = noise_reduce if self.noise_reduce: @@ -56,7 +61,10 @@ def set_language(self, lang_id): The id parameter actually refers to the shorthand. This has become ambiguous with the introduction of the actual language IDs """ self.text2phone = ArticulatoryCombinedTextFrontend(language=lang_id, add_silence_to_end=True) - self.lang_id = get_language_id(lang_id).to(self.device) + if self.use_lang_id: + self.lang_id = get_language_id(lang_id).to(self.device) + else: + self.lang_id = None def forward(self, text, view=False, durations=None, pitch=None, energy=None, input_is_phones=False): with torch.inference_mode():
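# A minimal sketch of the fallback cascade that PATCH 33/33 introduces, assuming
# `model_cls` stands in for the FastSpeech2 class that InferenceFastSpeech2.py
# already imports; `load_acoustic_model` is an illustrative name, not a function
# that exists in the repository.
import torch

def load_acoustic_model(checkpoint, device, model_cls):
    # Try the richest configuration first; a weight mismatch raises RuntimeError,
    # which indicates the checkpoint was trained without that embedding table.
    use_lang_id = True
    try:
        # multi-speaker, multi-language checkpoint
        phone2mel = model_cls(weights=checkpoint["model"]).to(torch.device(device))
    except RuntimeError:
        use_lang_id = False
        try:
            # multi-speaker, single-language checkpoint: no language embeddings
            phone2mel = model_cls(weights=checkpoint["model"], lang_embs=None).to(torch.device(device))
        except RuntimeError:
            # single-speaker checkpoint: neither language nor utterance embeddings
            phone2mel = model_cls(weights=checkpoint["model"], lang_embs=None, utt_embed_dim=None).to(torch.device(device))
    return phone2mel, use_lang_id

# With use_lang_id remembered on the interface, set_language and forward can fall
# back to lang_id=None whenever the loaded model has no language embedding table,
# which is what the hunks of this patch do.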