-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add VCTK speaker encoder recipe (#1912)
- Loading branch information
Showing
2 changed files
with
140 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
import os | ||
|
||
from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig | ||
|
||
# from TTS.encoder.configs.emotion_encoder_config import EmotionEncoderConfig | ||
from TTS.tts.configs.shared_configs import BaseDatasetConfig | ||
|
||
CURRENT_PATH = os.getcwd() | ||
# change the root path to the TTS root path | ||
os.chdir("../../../") | ||
|
||
### Definitions ### | ||
# dataset | ||
VCTK_PATH = "/raid/datasets/VCTK_NEW_16khz_removed_silence_silero_vad/" # download: https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zipdddddddddd | ||
RIR_SIMULATED_PATH = "/raid/datasets/DA/RIRS_NOISES/simulated_rirs/" # download: https://www.openslr.org/17/ | ||
MUSAN_PATH = "/raid/datasets/DA/musan/" # download: https://www.openslr.org/17/ | ||
|
||
# training | ||
OUTPUT_PATH = os.path.join( | ||
CURRENT_PATH, "resnet_speaker_encoder_training_output/" | ||
) # path to save the train logs and checkpoint | ||
CONFIG_OUT_PATH = os.path.join(OUTPUT_PATH, "config_se.json") | ||
RESTORE_PATH = None # Checkpoint to use for transfer learning if None ignore | ||
|
||
# instance the config | ||
# to speaker encoder | ||
config = SpeakerEncoderConfig() | ||
# to emotion encoder | ||
# config = EmotionEncoderConfig() | ||
|
||
|
||
#### DATASET CONFIG #### | ||
# The formatter need to return the key "speaker_name" for the speaker encoder and the "emotion_name" for the emotion encoder | ||
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", language="en-us", path=VCTK_PATH) | ||
|
||
# add the dataset to the config | ||
config.datasets = [dataset_config] | ||
|
||
|
||
#### TRAINING CONFIG #### | ||
# The encoder data loader balancer the dataset item equally to guarantee better training and to attend the losses requirements | ||
# It have two parameters to control the final batch size the number total of speaker used in each batch and the number of samples for each speaker | ||
|
||
# number total of speaker in batch in training | ||
config.num_classes_in_batch = 100 | ||
# number of utterance per class/speaker in the batch in training | ||
config.num_utter_per_class = 4 | ||
# final batch size = config.num_classes_in_batch * config.num_utter_per_class | ||
|
||
# number total of speaker in batch in evaluation | ||
config.eval_num_classes_in_batch = 100 | ||
# number of utterance per class/speaker in the batch in evaluation | ||
config.eval_num_utter_per_class = 4 | ||
|
||
# number of data loader workers | ||
config.num_loader_workers = 8 | ||
config.num_val_loader_workers = 8 | ||
|
||
# number of epochs | ||
config.epochs = 10000 | ||
# loss to be used in training | ||
config.loss = "softmaxproto" | ||
|
||
# run eval | ||
config.run_eval = False | ||
|
||
# output path for the checkpoints | ||
config.output_path = OUTPUT_PATH | ||
|
||
# Save local checkpoint every save_step steps | ||
config.save_step = 2000 | ||
|
||
### Model Config ### | ||
config.model_params = { | ||
"model_name": "resnet", # supported "lstm" and "resnet" | ||
"input_dim": 64, | ||
"use_torch_spec": True, | ||
"log_input": True, | ||
"proj_dim": 512, # embedding dim | ||
} | ||
|
||
### Audio Config ### | ||
# To fast train the model divides the audio in small parts. it parameter defines the length in seconds of these "parts" | ||
config.voice_len = 2.0 | ||
# all others configs | ||
config.audio = { | ||
"fft_size": 512, | ||
"win_length": 400, | ||
"hop_length": 160, | ||
"frame_shift_ms": None, | ||
"frame_length_ms": None, | ||
"stft_pad_mode": "reflect", | ||
"sample_rate": 16000, | ||
"resample": False, | ||
"preemphasis": 0.97, | ||
"ref_level_db": 20, | ||
"do_sound_norm": False, | ||
"do_trim_silence": False, | ||
"trim_db": 60, | ||
"power": 1.5, | ||
"griffin_lim_iters": 60, | ||
"num_mels": 64, | ||
"mel_fmin": 0.0, | ||
"mel_fmax": 8000.0, | ||
"spec_gain": 20, | ||
"signal_norm": False, | ||
"min_level_db": -100, | ||
"symmetric_norm": False, | ||
"max_norm": 4.0, | ||
"clip_norm": False, | ||
"stats_path": None, | ||
"do_rms_norm": True, | ||
"db_level": -27.0, | ||
} | ||
|
||
|
||
### Augmentation Config ### | ||
config.audio_augmentation = { | ||
# additive noise and room impulse response (RIR) simulation similar to: https://arxiv.org/pdf/2009.14153.pdf | ||
"p": 0.5, # probability to the use of one of the augmentation - 0 means disabled | ||
"rir": {"rir_path": RIR_SIMULATED_PATH, "conv_mode": "full"}, # download: https://www.openslr.org/17/ | ||
"additive": { | ||
"sounds_path": MUSAN_PATH, | ||
"speech": {"min_snr_in_db": 13, "max_snr_in_db": 20, "min_num_noises": 1, "max_num_noises": 1}, | ||
"noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1}, | ||
"music": {"min_snr_in_db": 5, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1}, | ||
}, | ||
"gaussian": {"p": 0.7, "min_amplitude": 0.0, "max_amplitude": 1e-05}, | ||
} | ||
|
||
config.save_json(CONFIG_OUT_PATH) | ||
|
||
print(CONFIG_OUT_PATH) | ||
if RESTORE_PATH is not None: | ||
command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH} --restore_path {RESTORE_PATH}" | ||
else: | ||
command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH}" | ||
|
||
os.system(command) |