From 096b35f6396d063205318fbaef10f08b9f699832 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 26 Aug 2022 11:19:03 -0300
Subject: [PATCH] Add VCTK speaker encoder recipe (#1912)

---
 TTS/encoder/models/base_encoder.py            |   1 +
 .../resnet_speaker_encoder/train_encoder.py   | 139 ++++++++++++++++++
 2 files changed, 140 insertions(+)
 create mode 100644 recipes/vctk/resnet_speaker_encoder/train_encoder.py

diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py
index ac7d7dd5a6..f741a2deae 100644
--- a/TTS/encoder/models/base_encoder.py
+++ b/TTS/encoder/models/base_encoder.py
@@ -112,6 +112,7 @@ def load_checkpoint(
         state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
         try:
             self.load_state_dict(state["model"])
+            print(" > Model fully restored. ")
         except (KeyError, RuntimeError) as error:
             # If eval raise the error
             if eval:
diff --git a/recipes/vctk/resnet_speaker_encoder/train_encoder.py b/recipes/vctk/resnet_speaker_encoder/train_encoder.py
new file mode 100644
index 0000000000..a96c631f74
--- /dev/null
+++ b/recipes/vctk/resnet_speaker_encoder/train_encoder.py
@@ -0,0 +1,139 @@
+import os
+
+from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig
+
+# from TTS.encoder.configs.emotion_encoder_config import EmotionEncoderConfig
+from TTS.tts.configs.shared_configs import BaseDatasetConfig
+
+CURRENT_PATH = os.getcwd()
+# change the root path to the TTS root path
+os.chdir("../../../")
+
+### Definitions ###
+# dataset
+VCTK_PATH = "/raid/datasets/VCTK_NEW_16khz_removed_silence_silero_vad/"  # download: https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip
+RIR_SIMULATED_PATH = "/raid/datasets/DA/RIRS_NOISES/simulated_rirs/"  # download: https://www.openslr.org/28/
+MUSAN_PATH = "/raid/datasets/DA/musan/"  # download: https://www.openslr.org/17/
+
+# training
+OUTPUT_PATH = os.path.join(
+    CURRENT_PATH, "resnet_speaker_encoder_training_output/"
+)  # path to save the train logs and checkpoints
+CONFIG_OUT_PATH = os.path.join(OUTPUT_PATH, "config_se.json")
+RESTORE_PATH = None  # checkpoint to use for transfer learning; if None, it is ignored
+
+# instantiate the config
+# for the speaker encoder
+config = SpeakerEncoderConfig()
+# for the emotion encoder
+# config = EmotionEncoderConfig()
+
+
+#### DATASET CONFIG ####
+# The formatter needs to return the key "speaker_name" for the speaker encoder and "emotion_name" for the emotion encoder
+dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", language="en-us", path=VCTK_PATH)
+
+# add the dataset to the config
+config.datasets = [dataset_config]
+
+
+#### TRAINING CONFIG ####
+# The encoder data loader balances the dataset items equally to guarantee better training and to meet the loss requirements
+# It has two parameters that control the final batch size: the total number of speakers used in each batch and the number of samples per speaker
+
+# total number of speakers in a training batch
+config.num_classes_in_batch = 100
+# number of utterances per class/speaker in a training batch
+config.num_utter_per_class = 4
+# final batch size = config.num_classes_in_batch * config.num_utter_per_class (100 * 4 = 400 samples here)
+
+# total number of speakers in an evaluation batch
+config.eval_num_classes_in_batch = 100
+# number of utterances per class/speaker in an evaluation batch
+config.eval_num_utter_per_class = 4
+
+# number of data loader workers
+config.num_loader_workers = 8
+config.num_val_loader_workers = 8
+
+# number of epochs
+config.epochs = 10000
+# loss to be used in training
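+# ("softmaxproto" combines a softmax classification loss with an angular
+# prototypical loss; "ge2e" and "angleproto" are also supported by the
+# encoder trainer)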
+config.loss = "softmaxproto"
+
+# run eval
+config.run_eval = False
+
+# save a local checkpoint every save_step steps
+config.save_step = 2000
+
+# output path for the checkpoints
+config.output_path = OUTPUT_PATH
+
+### Model Config ###
+config.model_params = {
+    "model_name": "resnet",  # supported: "lstm" and "resnet"
+    "input_dim": 64,
+    "use_torch_spec": True,
+    "log_input": True,
+    "proj_dim": 512,  # embedding dim
+}
+
+### Audio Config ###
+# To speed up training, the model splits the audio into small segments. This parameter defines the segment length in seconds
+config.voice_len = 2.0
+# all other audio configs
+config.audio = {
+    "fft_size": 512,
+    "win_length": 400,
+    "hop_length": 160,
+    "frame_shift_ms": None,
+    "frame_length_ms": None,
+    "stft_pad_mode": "reflect",
+    "sample_rate": 16000,
+    "resample": False,
+    "preemphasis": 0.97,
+    "ref_level_db": 20,
+    "do_sound_norm": False,
+    "do_trim_silence": False,
+    "trim_db": 60,
+    "power": 1.5,
+    "griffin_lim_iters": 60,
+    "num_mels": 64,
+    "mel_fmin": 0.0,
+    "mel_fmax": 8000.0,
+    "spec_gain": 20,
+    "signal_norm": False,
+    "min_level_db": -100,
+    "symmetric_norm": False,
+    "max_norm": 4.0,
+    "clip_norm": False,
+    "stats_path": None,
+    "do_rms_norm": True,
+    "db_level": -27.0,
+}
+
+
+### Augmentation Config ###
+config.audio_augmentation = {
+    # additive noise and room impulse response (RIR) simulation similar to: https://arxiv.org/pdf/2009.14153.pdf
+    "p": 0.5,  # probability of applying one of the augmentations; 0 means disabled
+    "rir": {"rir_path": RIR_SIMULATED_PATH, "conv_mode": "full"},  # download: https://www.openslr.org/28/
+    "additive": {
+        "sounds_path": MUSAN_PATH,
+        "speech": {"min_snr_in_db": 13, "max_snr_in_db": 20, "min_num_noises": 1, "max_num_noises": 1},
+        "noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
+        "music": {"min_snr_in_db": 5, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
+    },
+    "gaussian": {"p": 0.7, "min_amplitude": 0.0, "max_amplitude": 1e-05},
+}
+
+config.save_json(CONFIG_OUT_PATH)
+
+print(CONFIG_OUT_PATH)
+if RESTORE_PATH is not None:
+    command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH} --restore_path {RESTORE_PATH}"
+else:
+    command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH}"
+
+os.system(command)
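After training, the checkpoint written to `OUTPUT_PATH` can be loaded to extract speaker embeddings. Below is a minimal sketch, assuming the `SpeakerManager` helper shipped with Coqui TTS; the checkpoint filename and the input wav path are hypothetical placeholders:

```python
# Minimal sketch: extract a speaker embedding with the trained ResNet encoder.
# The paths below are hypothetical placeholders; point them at your own run.
from TTS.tts.utils.speakers import SpeakerManager

manager = SpeakerManager(
    encoder_model_path="resnet_speaker_encoder_training_output/checkpoint_2000.pth",
    encoder_config_path="resnet_speaker_encoder_training_output/config_se.json",
    use_cuda=False,
)

# The embedding has proj_dim entries (512 in this recipe).
embedding = manager.compute_embedding_from_clip("path/to/speaker_sample.wav")
print(len(embedding))  # expected: 512
```

Since `config.save_step = 2000`, checkpoints are written periodically during training; the exact filename depends on the step at which the run stops.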