Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add speaker encoder recipe #1912

Merged
merged 1 commit into from
Aug 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions TTS/encoder/models/base_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def load_checkpoint(
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
try:
self.load_state_dict(state["model"])
print(" > Model fully restored. ")
except (KeyError, RuntimeError) as error:
# If eval raise the error
if eval:
Expand Down
139 changes: 139 additions & 0 deletions recipes/vctk/resnet_speaker_encoder/train_encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import os

from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig

# from TTS.encoder.configs.emotion_encoder_config import EmotionEncoderConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig

CURRENT_PATH = os.getcwd()
# change the root path to the TTS root path
os.chdir("../../../")

### Definitions ###
# dataset
VCTK_PATH = "/raid/datasets/VCTK_NEW_16khz_removed_silence_silero_vad/" # download: https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zipdddddddddd
RIR_SIMULATED_PATH = "/raid/datasets/DA/RIRS_NOISES/simulated_rirs/" # download: https://www.openslr.org/17/
MUSAN_PATH = "/raid/datasets/DA/musan/" # download: https://www.openslr.org/17/

# training
OUTPUT_PATH = os.path.join(
CURRENT_PATH, "resnet_speaker_encoder_training_output/"
) # path to save the train logs and checkpoint
CONFIG_OUT_PATH = os.path.join(OUTPUT_PATH, "config_se.json")
RESTORE_PATH = None # Checkpoint to use for transfer learning if None ignore

# instance the config
# to speaker encoder
config = SpeakerEncoderConfig()
# to emotion encoder
# config = EmotionEncoderConfig()


#### DATASET CONFIG ####
# The formatter need to return the key "speaker_name" for the speaker encoder and the "emotion_name" for the emotion encoder
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", language="en-us", path=VCTK_PATH)

# add the dataset to the config
config.datasets = [dataset_config]


#### TRAINING CONFIG ####
# The encoder data loader balancer the dataset item equally to guarantee better training and to attend the losses requirements
# It have two parameters to control the final batch size the number total of speaker used in each batch and the number of samples for each speaker

# number total of speaker in batch in training
config.num_classes_in_batch = 100
# number of utterance per class/speaker in the batch in training
config.num_utter_per_class = 4
# final batch size = config.num_classes_in_batch * config.num_utter_per_class

# number total of speaker in batch in evaluation
config.eval_num_classes_in_batch = 100
# number of utterance per class/speaker in the batch in evaluation
config.eval_num_utter_per_class = 4

# number of data loader workers
config.num_loader_workers = 8
config.num_val_loader_workers = 8

# number of epochs
config.epochs = 10000
# loss to be used in training
config.loss = "softmaxproto"

# run eval
config.run_eval = False

# output path for the checkpoints
config.output_path = OUTPUT_PATH

# Save local checkpoint every save_step steps
config.save_step = 2000

### Model Config ###
config.model_params = {
"model_name": "resnet", # supported "lstm" and "resnet"
"input_dim": 64,
"use_torch_spec": True,
"log_input": True,
"proj_dim": 512, # embedding dim
}

### Audio Config ###
# To fast train the model divides the audio in small parts. it parameter defines the length in seconds of these "parts"
config.voice_len = 2.0
# all others configs
config.audio = {
"fft_size": 512,
"win_length": 400,
"hop_length": 160,
"frame_shift_ms": None,
"frame_length_ms": None,
"stft_pad_mode": "reflect",
"sample_rate": 16000,
"resample": False,
"preemphasis": 0.97,
"ref_level_db": 20,
"do_sound_norm": False,
"do_trim_silence": False,
"trim_db": 60,
"power": 1.5,
"griffin_lim_iters": 60,
"num_mels": 64,
"mel_fmin": 0.0,
"mel_fmax": 8000.0,
"spec_gain": 20,
"signal_norm": False,
"min_level_db": -100,
"symmetric_norm": False,
"max_norm": 4.0,
"clip_norm": False,
"stats_path": None,
"do_rms_norm": True,
"db_level": -27.0,
}


### Augmentation Config ###
config.audio_augmentation = {
# additive noise and room impulse response (RIR) simulation similar to: https://arxiv.org/pdf/2009.14153.pdf
"p": 0.5, # probability to the use of one of the augmentation - 0 means disabled
"rir": {"rir_path": RIR_SIMULATED_PATH, "conv_mode": "full"}, # download: https://www.openslr.org/17/
"additive": {
"sounds_path": MUSAN_PATH,
"speech": {"min_snr_in_db": 13, "max_snr_in_db": 20, "min_num_noises": 1, "max_num_noises": 1},
"noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
"music": {"min_snr_in_db": 5, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
},
"gaussian": {"p": 0.7, "min_amplitude": 0.0, "max_amplitude": 1e-05},
}

config.save_json(CONFIG_OUT_PATH)

print(CONFIG_OUT_PATH)
if RESTORE_PATH is not None:
command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH} --restore_path {RESTORE_PATH}"
else:
command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH}"

os.system(command)