From 774c4c1743506186333c9b8b40a3cbb362c36b53 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Wed, 22 Nov 2023 18:11:52 -0300
Subject: [PATCH 01/22] Add XTTS FT demo data processing pipeline

---
 TTS/demos/xtts_ft_demo/utils/formatter.py | 151 ++++++++++++++++++++
 TTS/demos/xtts_ft_demo/xtts_demo.py       | 161 ++++++++++++++++++++++
 2 files changed, 312 insertions(+)
 create mode 100644 TTS/demos/xtts_ft_demo/utils/formatter.py
 create mode 100644 TTS/demos/xtts_ft_demo/xtts_demo.py

diff --git a/TTS/demos/xtts_ft_demo/utils/formatter.py b/TTS/demos/xtts_ft_demo/utils/formatter.py
new file mode 100644
index 0000000000..95bb0f1b83
--- /dev/null
+++ b/TTS/demos/xtts_ft_demo/utils/formatter.py
@@ -0,0 +1,151 @@
+import os
+import pandas
+from faster_whisper import WhisperModel
+from glob import glob
+
+from tqdm import tqdm
+
+import torch
+import torchaudio
+from torchaudio.backend.sox_io_backend import load as torchaudio_sox_load
+from torchaudio.backend.soundfile_backend import load as torchaudio_soundfile_load
+# torch.set_num_threads(1)
+
+from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
+
+torch.set_num_threads(16)
+
+
+audio_types = (".wav", ".mp3", ".flac")
+
+
+def list_audios(basePath, contains=None):
+    # return the set of files that are valid
+    return list_files(basePath, validExts=audio_types, contains=contains)
+
+def list_files(basePath, validExts=None, contains=None):
+    # loop over the directory structure
+    for (rootDir, dirNames, filenames) in os.walk(basePath):
+        # loop over the filenames in the current directory
+        for filename in filenames:
+            # if the contains string is not none and the filename does not contain
+            # the supplied string, then ignore the file
+            if contains is not None and filename.find(contains) == -1:
+                continue
+
+            # determine the file extension of the current file
+            ext = filename[filename.rfind("."):].lower()
+
+            # check to see if the file is an audio file and should be processed
+            if validExts is None or ext.endswith(validExts):
+                # construct the path to the audio file and yield it
+                audioPath = os.path.join(rootDir, filename)
+                yield audioPath
+
+def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.5, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
+    # make sure the output directory exists
+    os.makedirs(out_path, exist_ok=True)
+
+    # Loading Whisper
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    print("Loading Whisper Model!")
+    asr_model = WhisperModel("large-v2", device=device, compute_type="float16")
+
+    metadata = {"audio_file": [], "text": [], "speaker_name": []}
+
+    if gradio_progress is not None:
+        tqdm_object = gradio_progress.tqdm(audio_files, desc="Formatting...")
+    else:
+        tqdm_object = tqdm(audio_files)
+
+    for audio_path in tqdm_object:
+        wav, sr = torchaudio.load(audio_path)
+        wav = wav.squeeze()
+        segments, info = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
+        segments = list(segments)
+        i = 0
+        sentence = ""
+        sentence_start = None
+        first_word = True
+        # gather the words of all segments into a single list
+        words_list = []
+        for _, segment in enumerate(segments):
+            words = list(segment.words)
+            words_list.extend(words)
+
+        # process each word
+        for word_idx, word in enumerate(words_list):
+            if first_word:
+                sentence_start = word.start
+                # If it is the first sentence, add a buffer or start at the beginning of the file
+                if word_idx == 0:
+                    sentence_start = max(sentence_start - buffer, 0)  # Add buffer to the sentence start
+                else:
+                    # get previous sentence end
+                    previous_word_end = words_list[word_idx - 1].end
+                    # add a buffer or use the middle of the silence between the previous sentence and the current one
+                    sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start) / 2)
+
+                sentence = word.word
+                first_word = False
+            else:
+                sentence += word.word
+
+            if word.word[-1] in ["!", ".", "?"]:
+                sentence = sentence[1:]
+                # Expand numbers and abbreviations plus normalization
+                sentence = multilingual_cleaners(sentence, target_language)
+                audio_file_name, ext = os.path.splitext(os.path.basename(audio_path))
+
+                audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}{ext}"
+
+                # Check for the next word's existence
+                if word_idx + 1 < len(words_list):
+                    next_word_start = words_list[word_idx + 1].start
+                else:
+                    # If there are no more words, this is the last sentence, so use the audio length as the next word start
+                    next_word_start = (wav.shape[0] - 1) / sr
+
+                # Average the current word end and next word start
+                word_end = min((word.end + next_word_start) / 2, word.end + buffer)
+
+                absolute_path = os.path.join(out_path, audio_file)
+                os.makedirs(os.path.dirname(absolute_path), exist_ok=True)
+                i += 1
+                first_word = True
+
+                audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
+                # if the audio is too short, ignore it (i.e. < 0.33 seconds)
+                if audio.size(-1) >= sr/3:
+                    torchaudio.backend.sox_io_backend.save(
+                        absolute_path,
+                        audio,
+                        sr
+                    )
+                else:
+                    continue
+
+                metadata["audio_file"].append(audio_file)
+                metadata["text"].append(sentence)
+                metadata["speaker_name"].append(speaker_name)
+
+    df = pandas.DataFrame(metadata)
+    df = df.sample(frac=1)
+    num_val_samples = int(len(df)*eval_percentage)
+
+    df_eval = df[:num_val_samples]
+    df_train = df[num_val_samples:]
+
+    df_train = df_train.sort_values('audio_file')
+    train_metadata_path = os.path.join(out_path, "metadata_train.csv")
+    df_train.to_csv(train_metadata_path, sep="|", index=False)
+
+    eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
+    df_eval = df_eval.sort_values('audio_file')
+    df_eval.to_csv(eval_metadata_path, sep="|", index=False)
+
+    return train_metadata_path, eval_metadata_path
\ No newline at end of file
diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
new file mode 100644
index 0000000000..99b6479293
--- /dev/null
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -0,0 +1,161 @@
+import os
+import sys
+import tempfile
+
+import gradio as gr
+import librosa.display
+import numpy as np
+
+import torch
+import torchaudio
+from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list, list_audios
+
+import logging
+
+PORT = 5003
+
+
+
+def run_tts(lang, tts_text, state_vars, temperature, rms_norm_output=False):
+    return None
+
+# define a logger to redirect
+class Logger:
+    def __init__(self, filename="log.out"):
+        self.log_file = filename
+        self.terminal = sys.stdout
+        self.log = open(self.log_file, "w")
+
+    def write(self, message):
+        self.terminal.write(message)
+        self.log.write(message)
+
+    def flush(self):
+        self.terminal.flush()
+        self.log.flush()
+
+    def isatty(self):
+        return False
+
+# redirect stdout and stderr to a file
+sys.stdout = Logger()
+sys.stderr = sys.stdout
+
+
+def read_logs():
+    sys.stdout.flush()
+    with open(sys.stdout.log_file, "r") as f:
+        return f.read()
+
+
+with gr.Blocks() as demo:
+    with gr.Tab("XTTS"):
+        state_vars = gr.State(
+        )
+        with gr.Row():
+            with gr.Column() as col1:
+                upload_file = gr.Audio(
+                    sources="upload",
label="Select here the audio files that you want to use for XTTS trainining !", + type="filepath", + ) + lang = gr.Dropdown( + label="Dataset Language", + value="en", + choices=[ + "en", + "es", + "fr", + "de", + "it", + "pt", + "pl", + "tr", + "ru", + "nl", + "cs", + "ar", + "zh", + "hu", + "ko", + "ja" + ], + ) + voice_ready = gr.Label( + label="Progress." + ) + logs = gr.Textbox( + label="Logs:", + interactive=False, + ) + demo.load(read_logs, None, logs, every=1) + + prompt_compute_btn = gr.Button(value="Step 1 - Create dataset.") + + with gr.Column() as col2: + + tts_text = gr.Textbox( + label="Input Text.", + value="This model sounds really good and above all, it's reasonably fast.", + ) + temperature = gr.Slider( + label="temperature", minimum=0.00001, maximum=1.0, step=0.05, value=0.75 + ) + rms_norm_output = gr.Checkbox( + label="RMS norm output.", value=True, interactive=True + ) + tts_btn = gr.Button(value="Step 2 - TTS") + + with gr.Column() as col3: + tts_output_audio_no_enhanced = gr.Audio(label="HiFi-GAN.") + tts_output_audio_no_enhanced_ft = gr.Audio(label="HiFi-GAN new.") + reference_audio = gr.Audio(label="Reference Speech used.") + + def preprocess_dataset(audio_path, language, state_vars, progress=gr.Progress(track_tqdm=True)): + # create a temp directory to save the dataset + out_path = tempfile.TemporaryDirectory().name + if audio_path is None: + # ToDo: raise an error + pass + else: + + train_meta, eval_meta = format_audio_list([audio_path], target_language=language, out_path=out_path, gradio_progress=progress) + + state_vars = {} + state_vars["train_csv"] = train_meta + state_vars["eval_csv"] = eval_meta + return "Dataset Processed!", state_vars + + prompt_compute_btn.click( + fn=preprocess_dataset, + inputs=[ + upload_file, + lang, + state_vars, + ], + outputs=[ + voice_ready, + state_vars, + ], + ) + + tts_btn.click( + fn=run_tts, + inputs=[ + lang, + tts_text, + state_vars, + temperature, + rms_norm_output, + ], + outputs=[tts_output_audio_no_enhanced, tts_output_audio_no_enhanced_ft], + ) + +if __name__ == "__main__": + demo.launch( + share=True, + debug=True, + server_port=PORT, + server_name="0.0.0.0" + ) From cc4f37e1b0bb270b1cd5883b2de0cf3bee279f62 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Thu, 23 Nov 2023 16:30:49 -0300 Subject: [PATCH 02/22] Add training and inference columns --- TTS/demos/xtts_ft_demo/utils/gpt_train.py | 163 +++++++++++++++++++ TTS/demos/xtts_ft_demo/xtts_demo.py | 178 ++++++++++++++++++--- TTS/tts/layers/xtts/trainer/gpt_trainer.py | 2 +- 3 files changed, 319 insertions(+), 24 deletions(-) create mode 100644 TTS/demos/xtts_ft_demo/utils/gpt_train.py diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py new file mode 100644 index 0000000000..a4f5cb9a10 --- /dev/null +++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py @@ -0,0 +1,163 @@ +import os + +from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.datasets import load_tts_samples +from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig +from TTS.utils.manage import ModelManager + + +def train_gpt(language, num_epochs, batch_size, train_csv, eval_csv, output_path): + # Logging parameters + RUN_NAME = "GPT_XTTSv2.1_FT" + PROJECT_NAME = "XTTS_trainer" + DASHBOARD_LOGGER = "tensorboard" + LOGGER_URI = None + + # Set here the path that the checkpoints will be saved. 
+    # Set here the path that the checkpoints will be saved. Default: ./run/training/
+    OUT_PATH = os.path.join(output_path, "run", "training")
+
+
+    # Training Parameters
+    OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-gpu training please make it False
+    START_WITH_EVAL = True  # if True it will start with evaluation
+    BATCH_SIZE = batch_size  # set here the batch size
+    GRAD_ACUMM_STEPS = 1  # set here the grad accumulation steps
+    # Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly.
+
+
+    # Define here the dataset that you want to use for the fine-tuning.
+    config_dataset = BaseDatasetConfig(
+        formatter="coqui",
+        dataset_name="ft_dataset",
+        path=os.path.dirname(train_csv),
+        meta_file_train=train_csv,
+        meta_file_val=eval_csv,
+        language=language,
+    )
+
+    # Add here the configs of the datasets
+    DATASETS_CONFIG_LIST = [config_dataset]
+
+    # Define the path where XTTS v2.0.1 files will be downloaded
+    CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
+    os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
+
+
+    # DVAE files
+    DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
+    MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
+
+    # Set the path to the downloaded files
+    DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
+    MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
+
+    # download DVAE files if needed
+    if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
+        print(" > Downloading DVAE files!")
+        ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
+
+
+    # Download XTTS v2.0 checkpoint if needed
+    TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
+    XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
+    XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json"
+
+    # XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
+    TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
+    XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file
+    XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK))  # config.json file
+
+    # download XTTS v2.0 files if needed
+    if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
+        print(" > Downloading XTTS v2.0 files!")
+        ModelManager._download_model_files(
+            [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK, XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+        )
+
+    # init args and config
+    model_args = GPTArgs(
+        max_conditioning_length=132300,  # 6 secs
+        min_conditioning_length=66150,  # 3 secs
+        debug_loading_failures=False,
+        max_wav_length=255995,  # ~11.6 seconds
+        max_text_length=200,
+        mel_norm_file=MEL_NORM_FILE,
+        dvae_checkpoint=DVAE_CHECKPOINT,
+        xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
+        tokenizer_file=TOKENIZER_FILE,
+        gpt_num_audio_tokens=1026,
+        gpt_start_audio_token=1024,
+        gpt_stop_audio_token=1025,
+        gpt_use_masking_gt_prompt_approach=True,
+        gpt_use_perceiver_resampler=True,
+    )
+    # define audio config
+    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
+    # training parameters config
+    config = GPTTrainerConfig(
+        epochs=num_epochs,
+        output_path=OUT_PATH,
+        model_args=model_args,
+        run_name=RUN_NAME,
+        project_name=PROJECT_NAME,
+        run_description="""
+            GPT XTTS training
+            """,
+        dashboard_logger=DASHBOARD_LOGGER,
+        logger_uri=LOGGER_URI,
+        audio=audio_config,
+        batch_size=BATCH_SIZE,
+        batch_group_size=48,
+        eval_batch_size=BATCH_SIZE,
+        num_loader_workers=8,
+        eval_split_max_size=256,
+        print_step=50,
+        plot_step=100,
+        log_model_step=100,
+        save_step=1000,
+        save_n_checkpoints=1,
+        save_checkpoints=True,
+        # target_loss="loss",
+        print_eval=False,
+        # Optimizer values like tortoise; PyTorch implementation with modifications to not apply WD to non-weight parameters.
+        optimizer="AdamW",
+        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
+        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
+        lr=5e-06,  # learning rate
+        lr_scheduler="MultiStepLR",
+        # it was adjusted accordingly for the new step scheme
+        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
+        test_sentences=[],
+    )
+
+    # init the model from config
+    model = GPTTrainer.init_from_config(config)
+
+    # load training samples
+    train_samples, eval_samples = load_tts_samples(
+        DATASETS_CONFIG_LIST,
+        eval_split=True,
+        eval_split_max_size=config.eval_split_max_size,
+        eval_split_size=config.eval_split_size,
+    )
+
+    # init the trainer and 🚀
+    trainer = Trainer(
+        TrainerArgs(
+            restore_path=None,  # the XTTS checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it with Trainer's restore_path parameter
+            skip_train_epoch=False,
+            start_with_eval=START_WITH_EVAL,
+            grad_accum_steps=GRAD_ACUMM_STEPS,
+        ),
+        config,
+        output_path=OUT_PATH,
+        model=model,
+        train_samples=train_samples,
+        eval_samples=eval_samples,
+    )
+    trainer.fit()
+
+
+    return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer.output_path, train_samples[0]["audio_file"]
diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
index 99b6479293..7e6e1c0944 100644
--- a/TTS/demos/xtts_ft_demo/xtts_demo.py
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -10,15 +10,50 @@
 import torch
 import torchaudio
 from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list, list_audios
+from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
+
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
 
-import logging
 
 PORT = 5003
 
+def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
+    config = XttsConfig()
+    config.load_json(xtts_config)
+    model = Xtts.init_from_config(config)
+    print("Loading XTTS model!")
") + model.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False) + if torch.cuda.is_available(): + model.cuda() + return model + +def run_tts(lang, tts_text, xtts_checkpoint, xtts_config, xtts_vocab, speaker_audio_file, state_vars): + # ToDo: add the load in other function to fast inference + model = load_model(xtts_checkpoint, xtts_config, xtts_vocab) + gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=model.config.gpt_cond_len, max_ref_length=model.config.max_ref_len, sound_norm_refs=model.config.sound_norm_refs) + speaker_embedding + out = model.inference( + text=tts_text, + language=lang, + gpt_cond_latent=gpt_cond_latent, + speaker_embedding=speaker_embedding, + temperature=model.config.temperature, # Add custom parameters here + length_penalty=model.config.length_penalty, + repetition_penalty=model.config.repetition_penalty, + top_k=model.config.top_k, + top_p=model.config.top_p, + ) + + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: + out["wav"] = torch.tensor(out["wav"]).unsqueeze(0) + out_path = fp.name + torchaudio.save(out_path, out["wav"], 24000) + + return out_path, speaker_audio_file + -def run_tts(lang, tts_text, state_vars, temperature, rms_norm_output=False): - return None # define a logger to redirect class Logger: @@ -43,6 +78,16 @@ def isatty(self): sys.stderr = sys.stdout +# logging.basicConfig(stream=sys.stdout, level=logging.INFO) +import logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + handlers=[ + logging.StreamHandler(sys.stdout) + ] +) + def read_logs(): sys.stdout.flush() with open(sys.stdout.log_file, "r") as f: @@ -82,8 +127,8 @@ def read_logs(): "ja" ], ) - voice_ready = gr.Label( - label="Progress." 
+                progress_data = gr.Label(
+                    label="Progress:"
                 )
                 logs = gr.Textbox(
                     label="Logs:",
                     interactive=False,
                 )
@@ -94,23 +139,78 @@ def read_logs():
 
             prompt_compute_btn = gr.Button(value="Step 1 - Create dataset.")
 
             with gr.Column() as col2:
-
+                num_epochs = gr.Slider(
+                    label="num_epochs",
+                    minimum=1,
+                    maximum=100,
+                    step=1,
+                    value=2,  # 15
+                )
+                batch_size = gr.Slider(
+                    label="batch_size",
+                    minimum=2,
+                    maximum=512,
+                    step=1,
+                    value=15,
+                )
+                progress_train = gr.Label(
+                    label="Progress:"
+                )
+                logs_tts_train = gr.Textbox(
+                    label="Logs:",
+                    interactive=False,
+                )
+                demo.load(read_logs, None, logs_tts_train, every=1)
+                train_btn = gr.Button(value="Step 2 - Run the training")
+
+            with gr.Column() as col3:
+                xtts_checkpoint = gr.Textbox(
+                    label="XTTS checkpoint path:",
+                    value="",
+                )
+                xtts_config = gr.Textbox(
+                    label="XTTS config path:",
+                    value="",
+                )
+                xtts_vocab = gr.Textbox(
+                    label="XTTS vocab path:",
+                    value="",
+                )
+                speaker_reference_audio = gr.Textbox(
+                    label="Speaker reference audio:",
+                    value="",
+                )
+                tts_language = gr.Dropdown(
+                    label="Language",
+                    value="en",
+                    choices=[
+                        "en",
+                        "es",
+                        "fr",
+                        "de",
+                        "it",
+                        "pt",
+                        "pl",
+                        "tr",
+                        "ru",
+                        "nl",
+                        "cs",
+                        "ar",
+                        "zh",
+                        "hu",
+                        "ko",
+                        "ja",
+                    ]
+                )
                 tts_text = gr.Textbox(
                     label="Input Text.",
                     value="This model sounds really good and above all, it's reasonably fast.",
                 )
-                temperature = gr.Slider(
-                    label="temperature", minimum=0.00001, maximum=1.0, step=0.05, value=0.75
-                )
-                rms_norm_output = gr.Checkbox(
-                    label="RMS norm output.", value=True, interactive=True
-                )
-                tts_btn = gr.Button(value="Step 2 - TTS")
+                tts_btn = gr.Button(value="Step 3 - Inference XTTS model")
+
+                tts_output_audio = gr.Audio(label="Generated Audio.")
+                reference_audio = gr.Audio(label="Reference audio used.")
 
-            with gr.Column() as col3:
-                tts_output_audio_no_enhanced = gr.Audio(label="HiFi-GAN.")
-                tts_output_audio_no_enhanced_ft = gr.Audio(label="HiFi-GAN new.")
-                reference_audio = gr.Audio(label="Reference Speech used.")
 
     def preprocess_dataset(audio_path, language, state_vars, progress=gr.Progress(track_tqdm=True)):
         # create a temp directory to save the dataset
@@ -119,12 +219,12 @@ def preprocess_dataset(audio_path, language, state_vars, progress=gr.Progress(track_tqdm=True)):
             # ToDo: raise an error
             pass
         else:
-
             train_meta, eval_meta = format_audio_list([audio_path], target_language=language, out_path=out_path, gradio_progress=progress)
 
         state_vars = {}
         state_vars["train_csv"] = train_meta
         state_vars["eval_csv"] = eval_meta
+        print(state_vars)
         return "Dataset Processed!", state_vars
 
     prompt_compute_btn.click(
@@ -135,23 +235,55 @@ def preprocess_dataset(audio_path, language, state_vars, progress=gr.Progress(track_tqdm=True)):
             state_vars,
         ],
         outputs=[
-            voice_ready,
+            progress_data,
             state_vars,
         ],
     )
+
+    def train_model(language, num_epochs, batch_size, state_vars, output_path="./", progress=gr.Progress(track_tqdm=True)):
+        # state_vars = {'train_csv': '/tmp/tmprh4k_vou/metadata_train.csv', 'eval_csv': '/tmp/tmprh4k_vou/metadata_eval.csv'}
+
+        config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, state_vars["train_csv"], state_vars["eval_csv"], output_path=output_path)
+        # copy original files to avoid parameter change issues
+        os.system(f"cp {config_path} {exp_path}")
+        os.system(f"cp {vocab_file} {exp_path}")
+
+        ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
+        state_vars["config_path"] = config_path
+        state_vars["original_xtts_checkpoint"] = original_xtts_checkpoint
+        state_vars["vocab_file"] = vocab_file
+        state_vars["ft_xtts_checkpoint"] = ft_xtts_checkpoint
+        state_vars["speaker_audio_file"] = speaker_wav
+        return "Model training done!", state_vars, config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
+
+    train_btn.click(
+        fn=train_model,
+        inputs=[
+            lang,
+            num_epochs,
+            batch_size,
+            state_vars,
+        ],
+        outputs=[progress_train, state_vars, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
+    )
+
     tts_btn.click(
         fn=run_tts,
         inputs=[
-            lang,
+            tts_language,
             tts_text,
+            xtts_checkpoint,
+            xtts_config,
+            xtts_vocab,
+            speaker_reference_audio,
             state_vars,
-            temperature,
-            rms_norm_output,
         ],
-        outputs=[tts_output_audio_no_enhanced, tts_output_audio_no_enhanced_ft],
+        outputs=[tts_output_audio, reference_audio],
     )
+
+
 if __name__ == "__main__":
     demo.launch(
         share=True,
diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py
index 4789e1f43f..671be8eb7f 100644
--- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py
+++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py
@@ -225,11 +225,11 @@ def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels):
 
     @torch.no_grad()
     def test_run(self, assets) -> Tuple[Dict, Dict]:  # pylint: disable=W0613
+        test_audios = {}
         if self.config.test_sentences:
             # init gpt for inference mode
             self.xtts.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache, use_deepspeed=False)
             self.xtts.gpt.eval()
-            test_audios = {}
             print(" | > Synthesizing test sentences.")
             for idx, s_info in enumerate(self.config.test_sentences):
                 wav = self.xtts.synthesize(

From 7cc348ed763c7f8e0d9ff045b6326e15b38c3e4e Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Thu, 23 Nov 2023 17:50:41 -0300
Subject: [PATCH 03/22] Uses tabs instead of columns

---
 TTS/demos/xtts_ft_demo/requirements.txt |   1 +
 TTS/demos/xtts_ft_demo/xtts_demo.py     | 229 ++++++++++++------------
 2 files changed, 115 insertions(+), 115 deletions(-)
 create mode 100644 TTS/demos/xtts_ft_demo/requirements.txt

diff --git a/TTS/demos/xtts_ft_demo/requirements.txt b/TTS/demos/xtts_ft_demo/requirements.txt
new file mode 100644
index 0000000000..8360accff2
--- /dev/null
+++ b/TTS/demos/xtts_ft_demo/requirements.txt
@@ -0,0 +1 @@
+faster_whisper
\ No newline at end of file
diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
index 7e6e1c0944..016a929e2c 100644
--- a/TTS/demos/xtts_ft_demo/xtts_demo.py
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -95,123 +95,46 @@ def read_logs():
 
 with gr.Blocks() as demo:
-    with gr.Tab("XTTS"):
-        state_vars = gr.State(
+    state_vars = gr.State()
+    with gr.Tab("Data processing"):
+        upload_file = gr.Audio(
+            sources="upload",
+            label="Select here the audio files that you want to use for XTTS training!",
+            type="filepath",
         )
+        lang = gr.Dropdown(
+            label="Dataset Language",
+            value="en",
+            choices=[
+                "en",
+                "es",
+                "fr",
+                "de",
+                "it",
+                "pt",
+                "pl",
+                "tr",
+                "ru",
+                "nl",
+                "cs",
+                "ar",
+                "zh",
+                "hu",
+                "ko",
+                "ja"
+            ],
+        )
+        progress_data = gr.Label(
+            label="Progress:"
+        )
+        logs = gr.Textbox(
+            label="Logs:",
+            interactive=False,
+        )
+        demo.load(read_logs, None, logs, every=1)
 
+        prompt_compute_btn = gr.Button(value="Step 1 - Create dataset.")
+
     def preprocess_dataset(audio_path, language, state_vars, progress=gr.Progress(track_tqdm=True)):
         # create a temp directory to save the dataset
         out_path = tempfile.TemporaryDirectory().name
         if audio_path is None:
             # ToDo: raise an error
             pass
         else:
             train_meta, eval_meta = format_audio_list([audio_path], target_language=language, out_path=out_path, gradio_progress=progress)
 
         state_vars = {}
         state_vars["train_csv"] = train_meta
         state_vars["eval_csv"] = eval_meta
         print(state_vars)
         return "Dataset Processed!", state_vars
 
     prompt_compute_btn.click(
         fn=preprocess_dataset,
         inputs=[
             upload_file,
             lang,
             state_vars,
         ],
         outputs=[
             progress_data,
             state_vars,
         ],
     )
 
+    with gr.Tab("Fine-tuning XTTS"):
+        num_epochs = gr.Slider(
+            label="num_epochs",
+            minimum=1,
+            maximum=100,
+            step=1,
+            value=2,  # 15
+        )
+        batch_size = gr.Slider(
+            label="batch_size",
+            minimum=2,
+            maximum=512,
+            step=1,
+            value=15,
+        )
+        progress_train = gr.Label(
+            label="Progress:"
+        )
+        logs_tts_train = gr.Textbox(
+            label="Logs:",
+            interactive=False,
+        )
+        demo.load(read_logs, None, logs_tts_train, every=1)
+        train_btn = gr.Button(value="Step 2 - Run the training")
 
     def train_model(language, num_epochs, batch_size, state_vars, output_path="./", progress=gr.Progress(track_tqdm=True)):
         # state_vars = {'train_csv': '/tmp/tmprh4k_vou/metadata_train.csv', 'eval_csv': '/tmp/tmprh4k_vou/metadata_eval.csv'}
 
         config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, state_vars["train_csv"], state_vars["eval_csv"], output_path=output_path)
         # copy original files to avoid parameter change issues
         os.system(f"cp {config_path} {exp_path}")
         os.system(f"cp {vocab_file} {exp_path}")
 
         ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
         state_vars["config_path"] = config_path
         state_vars["original_xtts_checkpoint"] = original_xtts_checkpoint
         state_vars["vocab_file"] = vocab_file
         state_vars["ft_xtts_checkpoint"] = ft_xtts_checkpoint
         state_vars["speaker_audio_file"] = speaker_wav
 
         return "Model training done!", state_vars, config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
 
+    with gr.Tab("Inference"):
+        xtts_checkpoint = gr.Textbox(
+            label="XTTS checkpoint path:",
+            value="",
+        )
+        xtts_config = gr.Textbox(
+            label="XTTS config path:",
+            value="",
+        )
+        xtts_vocab = gr.Textbox(
+            label="XTTS vocab path:",
+            value="",
+        )
+        speaker_reference_audio = gr.Textbox(
+            label="Speaker reference audio:",
+            value="",
+        )
+        tts_language = gr.Dropdown(
+            label="Language",
+            value="en",
+            choices=[
+                "en",
+                "es",
+                "fr",
+                "de",
+                "it",
+                "pt",
+                "pl",
+                "tr",
+                "ru",
+                "nl",
+                "cs",
+                "ar",
+                "zh",
+                "hu",
+                "ko",
+                "ja",
+            ]
+        )
+        tts_text = gr.Textbox(
+            label="Input Text.",
+            value="This model sounds really good and above all, it's reasonably fast.",
+        )
+        tts_btn = gr.Button(value="Step 3 - Inference XTTS model")
+
+        tts_output_audio = gr.Audio(label="Generated Audio.")
+        reference_audio = gr.Audio(label="Reference audio used.")
+
 
     train_btn.click(
         fn=train_model,
         inputs=[
             lang,
             num_epochs,
             batch_size,
             state_vars,
         ],
         outputs=[progress_train, state_vars, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
     )
 
+
     tts_btn.click(
         fn=run_tts,
         inputs=[
             tts_language,
             tts_text,
             xtts_checkpoint,
             xtts_config,
             xtts_vocab,
             speaker_reference_audio,
             state_vars,
         ],
         outputs=[tts_output_audio, reference_audio],
     )

From 626d9e16fb35061c3840276a26c08077f1fab309 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 24 Nov 2023 08:44:21 -0300
Subject: [PATCH 04/22] Fix demo freezing issue

---
 TTS/demos/xtts_ft_demo/requirements.txt |  3 +-
 TTS/demos/xtts_ft_demo/xtts_demo.py     | 71 +++++++++++++------------
 2 files changed, 39 insertions(+), 35 deletions(-)

diff --git a/TTS/demos/xtts_ft_demo/requirements.txt b/TTS/demos/xtts_ft_demo/requirements.txt
index 8360accff2..cb5b16f66e 100644
--- a/TTS/demos/xtts_ft_demo/requirements.txt
+++ b/TTS/demos/xtts_ft_demo/requirements.txt
@@ -1 +1,2 @@
-faster_whisper
\ No newline at end of file
+faster_whisper==0.9.0
+gradio==4.7.1
\ No newline at end of file
diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
index 016a929e2c..6fee1a5050 100644
--- a/TTS/demos/xtts_ft_demo/xtts_demo.py
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -28,7 +28,7 @@
         model.cuda()
     return model
 
-def run_tts(lang, tts_text, xtts_checkpoint, xtts_config, xtts_vocab, speaker_audio_file, state_vars):
+def run_tts(lang, tts_text, xtts_checkpoint, xtts_config, xtts_vocab, speaker_audio_file):
     # ToDo: move the model loading into a separate function for faster inference
     model = load_model(xtts_checkpoint, xtts_config, xtts_vocab)
     gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=model.config.gpt_cond_len, max_ref_length=model.config.max_ref_len, sound_norm_refs=model.config.sound_norm_refs)
@@ -95,7 +95,7 @@ def read_logs():
 
 with gr.Blocks() as demo:
-    state_vars = gr.State()
+    # state_vars = gr.State()
     with gr.Tab("Data processing"):
         upload_file = gr.Audio(
             sources="upload",
@@ -135,7 +135,7 @@ def read_logs():
 
     prompt_compute_btn = gr.Button(value="Step 1 - Create dataset.")
 
-    def preprocess_dataset(audio_path, language, state_vars, progress=gr.Progress(track_tqdm=True)):
+    def preprocess_dataset(audio_path, language, progress=gr.Progress(track_tqdm=True)):
         # create a temp directory to save the dataset
         out_path = tempfile.TemporaryDirectory().name
         if audio_path is None:
@@ -144,27 +144,15 @@
             # ToDo: raise an error
             pass
         else:
             train_meta, eval_meta = format_audio_list([audio_path], target_language=language, out_path=out_path, gradio_progress=progress)
 
-        state_vars = {}
-        state_vars["train_csv"] = train_meta
-        state_vars["eval_csv"] = eval_meta
-        print(state_vars)
-        return "Dataset Processed!", state_vars
-
-    prompt_compute_btn.click(
-        fn=preprocess_dataset,
-        inputs=[
-            upload_file,
-            lang,
-            state_vars,
-        ],
-        outputs=[
-            progress_data,
-            state_vars,
-        ],
-    )
+        return "Dataset Processed!", train_meta, eval_meta
 
     with gr.Tab("Fine-tuning XTTS"):
+        train_csv = gr.Textbox(
+            label="Train CSV:",
+        )
+        eval_csv = gr.Textbox(
+            label="Eval CSV:",
+        )
         num_epochs = gr.Slider(
             label="num_epochs",
             minimum=1,
@@ -189,21 +177,22 @@
         )
         demo.load(read_logs, None, logs_tts_train, every=1)
         train_btn = gr.Button(value="Step 2 - Run the training")
 
-    def train_model(language, num_epochs, batch_size, state_vars, output_path="./", progress=gr.Progress(track_tqdm=True)):
-        # state_vars = {'train_csv': '/tmp/tmprh4k_vou/metadata_train.csv', 'eval_csv': '/tmp/tmprh4k_vou/metadata_eval.csv'}
+    def train_model(language, train_csv, eval_csv, num_epochs, batch_size, output_path="./", progress=gr.Progress(track_tqdm=True)):
+        # train_csv = '/tmp/tmprh4k_vou/metadata_train.csv'
+        # eval_csv = '/tmp/tmprh4k_vou/metadata_eval.csv'
 
-        config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, state_vars["train_csv"], state_vars["eval_csv"], output_path=output_path)
+        config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, train_csv, eval_csv, output_path=output_path)
         # copy original files to avoid parameter change issues
         os.system(f"cp {config_path} {exp_path}")
         os.system(f"cp {vocab_file} {exp_path}")
 
         ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
-        state_vars["config_path"] = config_path
-        state_vars["original_xtts_checkpoint"] = original_xtts_checkpoint
-        state_vars["vocab_file"] = vocab_file
-        state_vars["ft_xtts_checkpoint"] = ft_xtts_checkpoint
-        state_vars["speaker_audio_file"] = speaker_wav
-        return "Model training done!", state_vars, config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
+        # state_vars["config_path"] = config_path
+        # state_vars["original_xtts_checkpoint"] = original_xtts_checkpoint
+        # state_vars["vocab_file"] = vocab_file
+        # state_vars["ft_xtts_checkpoint"] = ft_xtts_checkpoint
+        # state_vars["speaker_audio_file"] = speaker_wav
+        return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
 
@@ -254,16 +243,31 @@
+    prompt_compute_btn.click(
+        fn=preprocess_dataset,
+        inputs=[
+            upload_file,
+            lang,
+        ],
+        outputs=[
+            progress_data,
+            train_csv,
+            eval_csv,
+        ],
+    )
+
 
     train_btn.click(
         fn=train_model,
         inputs=[
             lang,
+            train_csv,
+            eval_csv,
             num_epochs,
             batch_size,
         ],
-        outputs=[progress_train, state_vars, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
+        outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
     )
 
@@ -276,7 +280,6 @@
             xtts_config,
             xtts_vocab,
             speaker_reference_audio,
-            state_vars,
         ],
         outputs=[tts_output_audio, reference_audio],
     )

From fa9bb26ebb2cb4ecb1e37a3b4fad608ee9ddc96c Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 24 Nov 2023 10:22:12 -0300
Subject: [PATCH 05/22] Update demo

---
 TTS/demos/xtts_ft_demo/utils/formatter.py |  10 +-
 TTS/demos/xtts_ft_demo/utils/gpt_train.py |   6 +-
 TTS/demos/xtts_ft_demo/xtts_demo.py       | 190 ++++++++++++----------
 3 files changed, 114 insertions(+), 92 deletions(-)

diff --git a/TTS/demos/xtts_ft_demo/utils/formatter.py b/TTS/demos/xtts_ft_demo/utils/formatter.py
index 95bb0f1b83..03db6c2ca3 100644
--- a/TTS/demos/xtts_ft_demo/utils/formatter.py
+++ b/TTS/demos/xtts_ft_demo/utils/formatter.py
@@ -8,8 +8,6 @@
 import torch
 import torchaudio
-from torchaudio.backend.sox_io_backend import load as torchaudio_sox_load
-from torchaudio.backend.soundfile_backend import load as torchaudio_soundfile_load
 # torch.set_num_threads(1)
 
 from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
@@ -45,7 +43,7 @@ def list_files(basePath, validExts=None, contains=None):
             audioPath = os.path.join(rootDir, filename)
             yield audioPath
 
-def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.5, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
+def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
     # make sure the output directory exists
     os.makedirs(out_path, exist_ok=True)
@@ -121,10 +119,10 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
         audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
         # if the audio is too short, ignore it (i.e. < 0.33 seconds)
         if audio.size(-1) >= sr/3:
-            torchaudio.backend.sox_io_backend.save(
-                absolute_path,
+            torchaudio.save(absolute_path,
                 audio,
-                sr
+                sr,
+                backend="sox",
             )
         else:
             continue
diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
index a4f5cb9a10..1e7d5f367e 100644
--- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py
+++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
@@ -159,5 +159,9 @@ def train_gpt(language, num_epochs, batch_size, train_csv, eval_csv, output_path):
     )
     trainer.fit()
 
+    # get the audio file with the longest text to use as the speaker reference
+    samples_len = [len(item["text"].split(" ")) for item in train_samples]
+    longest_text_idx = samples_len.index(max(samples_len))
+    speaker_ref = train_samples[longest_text_idx]["audio_file"]
 
-    return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer.output_path, train_samples[0]["audio_file"]
+    return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer.output_path, speaker_ref
diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
index 6fee1a5050..9dcaefceb9 100644
--- a/TTS/demos/xtts_ft_demo/xtts_demo.py
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -18,31 +18,32 @@
 
 PORT = 5003
 
+XTTS_MODEL = None
 def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
+    global XTTS_MODEL
     config = XttsConfig()
     config.load_json(xtts_config)
-    model = Xtts.init_from_config(config)
+    XTTS_MODEL = Xtts.init_from_config(config)
     print("Loading XTTS model!")
-    model.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
+    XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
     if torch.cuda.is_available():
-        model.cuda()
-    return model
+        XTTS_MODEL.cuda()
+
+    print("Model Loaded!")
+    return "Model Loaded!"
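[An aside on the pattern above: load_model keeps the fine-tuned model in the module-level XTTS_MODEL so that later calls reuse it instead of reloading the checkpoint on every synthesis. The conditioning latents could be cached the same way. The sketch below is not part of this patch series (_LATENT_CACHE and get_latents_cached are hypothetical names), but it reuses the same get_conditioning_latents call that the new run_tts makes just after this point.]

# Hypothetical sketch, not code from this PR: cache conditioning latents per
# reference file so repeated syntheses skip recomputing them.
_LATENT_CACHE = {}

def get_latents_cached(speaker_audio_file):
    # recompute only on a cache miss; the latents depend only on the file and config
    if speaker_audio_file not in _LATENT_CACHE:
        _LATENT_CACHE[speaker_audio_file] = XTTS_MODEL.get_conditioning_latents(
            audio_path=speaker_audio_file,
            gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
            max_ref_length=XTTS_MODEL.config.max_ref_len,
            sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
        )
    return _LATENT_CACHE[speaker_audio_file]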
+
-def run_tts(lang, tts_text, xtts_checkpoint, xtts_config, xtts_vocab, speaker_audio_file):
-    # ToDo: move the model loading into a separate function for faster inference
-    model = load_model(xtts_checkpoint, xtts_config, xtts_vocab)
-    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=model.config.gpt_cond_len, max_ref_length=model.config.max_ref_len, sound_norm_refs=model.config.sound_norm_refs)
-    out = model.inference(
+def run_tts(lang, tts_text, speaker_audio_file):
+    gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_length=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs)
+    out = XTTS_MODEL.inference(
         text=tts_text,
         language=lang,
         gpt_cond_latent=gpt_cond_latent,
         speaker_embedding=speaker_embedding,
-        temperature=model.config.temperature,  # Add custom parameters here
-        length_penalty=model.config.length_penalty,
-        repetition_penalty=model.config.repetition_penalty,
-        top_k=model.config.top_k,
-        top_p=model.config.top_p,
+        temperature=XTTS_MODEL.config.temperature,  # Add custom parameters here
+        length_penalty=XTTS_MODEL.config.length_penalty,
+        repetition_penalty=XTTS_MODEL.config.repetition_penalty,
+        top_k=XTTS_MODEL.config.top_k,
+        top_p=XTTS_MODEL.config.top_p,
     )
 
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
 
 with gr.Blocks() as demo:
-    # state_vars = gr.State()
-    with gr.Tab("Data processing"):
-        upload_file = gr.Audio(
-            sources="upload",
-            label="Select here the audio files that you want to use for XTTS training!",
-            type="filepath",
+    with gr.Tab("Data processing"):
+        out_path = gr.Textbox(
+            label="Output path (where data and checkpoints will be saved):",
+            value="/tmp/xtts_ft/"
+        )
+        # upload_file = gr.Audio(
+        #     sources="upload",
+        #     label="Select here the audio files that you want to use for XTTS training!",
+        #     type="filepath",
+        # )
+        upload_file = gr.File(
+            file_count="multiple",
+            label="Select here the audio files that you want to use for XTTS training (Supported formats: wav, mp3, and flac)",
         )
         lang = gr.Dropdown(
             label="Dataset Language",
             value="en",
 
     prompt_compute_btn = gr.Button(value="Step 1 - Create dataset.")
 
-    def preprocess_dataset(audio_path, language, progress=gr.Progress(track_tqdm=True)):
-        # create a temp directory to save the dataset
-        out_path = tempfile.TemporaryDirectory().name
+    def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
+        out_path = os.path.join(out_path, "dataset")
+        os.makedirs(out_path, exist_ok=True)
         if audio_path is None:
             # ToDo: raise an error
             pass
         else:
-            train_meta, eval_meta = format_audio_list([audio_path], target_language=language, out_path=out_path, gradio_progress=progress)
+            train_meta, eval_meta = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
+        print("Dataset Processed!")
         return "Dataset Processed!", train_meta, eval_meta
 
-    with gr.Tab("Fine-tuning XTTS"):
+    with gr.Tab("Fine-tuning XTTS Encoder"):
         train_csv = gr.Textbox(
             label="Train CSV:",
         )
         eval_csv = gr.Textbox(
             label="Eval CSV:",
         )
         num_epochs = gr.Slider(
             label="num_epochs",
             minimum=1,
             maximum=100,
             step=1,
-            value=2,  # 15
+            value=10,
         )
         batch_size = gr.Slider(
             label="batch_size",
             minimum=2,
             maximum=512,
             step=1,
             value=15,
         )
         progress_train = gr.Label(
             label="Progress:"
         )
         logs_tts_train = gr.Textbox(
             label="Logs:",
             interactive=False,
         )
         demo.load(read_logs, None, logs_tts_train, every=1)
         train_btn = gr.Button(value="Step 2 - Run the training")
 
     def train_model(language, train_csv, eval_csv, num_epochs, batch_size, output_path, progress=gr.Progress(track_tqdm=True)):
         # train_csv = '/tmp/tmprh4k_vou/metadata_train.csv'
         # eval_csv = '/tmp/tmprh4k_vou/metadata_eval.csv'
@@ -187,67 +195,73 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, output_path, progress=gr.Progress(track_tqdm=True)):
         os.system(f"cp {vocab_file} {exp_path}")
 
         ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
-        # state_vars["config_path"] = config_path
-        # state_vars["original_xtts_checkpoint"] = original_xtts_checkpoint
-        # state_vars["vocab_file"] = vocab_file
-        # state_vars["ft_xtts_checkpoint"] = ft_xtts_checkpoint
-        # state_vars["speaker_audio_file"] = speaker_wav
+        print("Model training done!")
         return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
 
 
     with gr.Tab("Inference"):
-        xtts_checkpoint = gr.Textbox(
-            label="XTTS checkpoint path:",
-            value="",
-        )
-        xtts_config = gr.Textbox(
-            label="XTTS config path:",
-            value="",
-        )
-        xtts_vocab = gr.Textbox(
-            label="XTTS vocab path:",
-            value="",
-        )
-        speaker_reference_audio = gr.Textbox(
-            label="Speaker reference audio:",
-            value="",
-        )
-        tts_language = gr.Dropdown(
-            label="Language",
-            value="en",
-            choices=[
-                "en",
-                "es",
-                "fr",
-                "de",
-                "it",
-                "pt",
-                "pl",
-                "tr",
-                "ru",
-                "nl",
-                "cs",
-                "ar",
-                "zh",
-                "hu",
-                "ko",
-                "ja",
-            ]
-        )
-        tts_text = gr.Textbox(
-            label="Input Text.",
-            value="This model sounds really good and above all, it's reasonably fast.",
-        )
-        tts_btn = gr.Button(value="Step 3 - Inference XTTS model")
-
-        tts_output_audio = gr.Audio(label="Generated Audio.")
-        reference_audio = gr.Audio(label="Reference audio used.")
+        with gr.Row():
+            with gr.Column() as col1:
+                xtts_checkpoint = gr.Textbox(
+                    label="XTTS checkpoint path:",
+                    value="",
+                )
+                xtts_config = gr.Textbox(
+                    label="XTTS config path:",
+                    value="",
+                )
+                xtts_vocab = gr.Textbox(
+                    label="XTTS vocab path:",
+                    value="",
+                )
+                progress_load = gr.Label(
+                    label="Progress:"
+                )
+                load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")
+
+            with gr.Column() as col2:
+                speaker_reference_audio = gr.Textbox(
+                    label="Speaker reference audio:",
+                    value="",
+                )
+                tts_language = gr.Dropdown(
+                    label="Language",
+                    value="en",
+                    choices=[
+                        "en",
+                        "es",
+                        "fr",
+                        "de",
+                        "it",
+                        "pt",
+                        "pl",
+                        "tr",
+                        "ru",
+                        "nl",
+                        "cs",
+                        "ar",
+                        "zh",
+                        "hu",
+                        "ko",
+                        "ja",
+                    ]
+                )
+                tts_text = gr.Textbox(
+                    label="Input Text.",
+                    value="This model sounds really good and above all, it's reasonably fast.",
+                )
+                tts_btn = gr.Button(value="Step 4 - Inference")
+
+            with gr.Column() as col3:
+                tts_output_audio = gr.Audio(label="Generated Audio.")
+                reference_audio = gr.Audio(label="Reference audio used.")
 
     prompt_compute_btn.click(
         fn=preprocess_dataset,
         inputs=[
             upload_file,
             lang,
+            out_path,
         ],
         outputs=[
             progress_data,
             train_csv,
             eval_csv,
         ],
     )
-
 
     train_btn.click(
         fn=train_model,
         inputs=[
             lang,
             train_csv,
             eval_csv,
             num_epochs,
             batch_size,
+            out_path,
         ],
         outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
     )
 
+    load_btn.click(
+        fn=load_model,
+        inputs=[
+            xtts_checkpoint,
+            xtts_config,
+            xtts_vocab
+        ],
+        outputs=[progress_load],
+    )
 
     tts_btn.click(
         fn=run_tts,
         inputs=[
             tts_language,
             tts_text,
             speaker_reference_audio,
         ],
         outputs=[tts_output_audio, reference_audio],
     )

From 3fc2880127ca5baadf65da633fd6e4c6424fe4e2 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 24 Nov 2023 10:25:24 -0300
Subject: [PATCH 06/22] Convert stereo to mono

---
 TTS/demos/xtts_ft_demo/utils/formatter.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/TTS/demos/xtts_ft_demo/utils/formatter.py b/TTS/demos/xtts_ft_demo/utils/formatter.py
index 03db6c2ca3..6497b0d792 100644
--- a/TTS/demos/xtts_ft_demo/utils/formatter.py
+++ b/TTS/demos/xtts_ft_demo/utils/formatter.py
@@ -62,6 +62,10 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
 
     for audio_path in tqdm_object:
         wav, sr = torchaudio.load(audio_path)
+        # stereo to mono if needed
+        if wav.size(0) != 1:
+            wav = torch.mean(wav, dim=0, keepdim=True)
+
         wav = wav.squeeze()

From af74cd442675b89293327bad5a55f643eef38cdd Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 24 Nov 2023 12:07:00 -0300
Subject: [PATCH 07/22] Bug fix on XTTS inference

---
 TTS/tts/models/xtts.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py
index 208ec4d561..6b8cc59101 100644
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@@ -272,6 +272,11 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = 6):
         style_embs = []
         for i in range(0, audio.shape[1], 22050 * chunk_length):
             audio_chunk = audio[:, i : i + 22050 * chunk_length]
+
+            # if the chunk is too short, ignore it
+            if audio_chunk.size(-1) < 22050 * 0.33:
+                continue
+
             mel_chunk = wav_to_mel_cloning(
                 audio_chunk,
                 mel_norms=self.mel_stats.cpu(),

From 8967fc7ef2de54c3c00e532753d72e66cb1406c7 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 24 Nov 2023 14:26:26 -0300
Subject: [PATCH 08/22] Update gradio demo

---
 TTS/demos/xtts_ft_demo/utils/formatter.py | 17 +++++++++------
 TTS/demos/xtts_ft_demo/utils/gpt_train.py |  3 +++
 TTS/demos/xtts_ft_demo/xtts_demo.py       | 26 ++++++++++++++++++-----
 3 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/TTS/demos/xtts_ft_demo/utils/formatter.py b/TTS/demos/xtts_ft_demo/utils/formatter.py
index 6497b0d792..e49d2426a7 100644
--- a/TTS/demos/xtts_ft_demo/utils/formatter.py
+++ b/TTS/demos/xtts_ft_demo/utils/formatter.py
@@ -44,6 +44,7 @@ def list_files(basePath, validExts=None, contains=None):
         yield audioPath
 
 def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
+    audio_total_size = 0
     # make sure the output directory exists
     os.makedirs(out_path, exist_ok=True)
@@ -67,7 +68,9 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
             wav = torch.mean(wav, dim=0, keepdim=True)
 
         wav = wav.squeeze()
+        audio_total_size += (wav.size(-1) / sr)
+
-        segments, info = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
+        segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
         segments = list(segments)
         i = 0
         sentence = ""
@@ -101,9 +104,9 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
             sentence = sentence[1:]
             # Expand numbers and abbreviations plus normalization
             sentence = multilingual_cleaners(sentence, target_language)
-            audio_file_name, ext = os.path.splitext(os.path.basename(audio_path))
+            audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))
 
-            audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}{ext}"
+            audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"
 
             # Check for the next word's existence
@@ -125,8 +128,7 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
         if audio.size(-1) >= sr/3:
             torchaudio.save(absolute_path,
                 audio,
-                sr,
-                backend="sox",
+                sr
             )
         else:
             continue
@@ -150,4 +152,7 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
     df_eval = df_eval.sort_values('audio_file')
     df_eval.to_csv(eval_metadata_path, sep="|", index=False)
 
-    return train_metadata_path, eval_metadata_path
+    # deallocate VRAM
+    del asr_model
+
+    return train_metadata_path, eval_metadata_path, audio_total_size
\ No newline at end of file
diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
index 1e7d5f367e..4d33d6fc3b 100644
--- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py
+++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
@@ -164,4 +164,7 @@ def train_gpt(language, num_epochs, batch_size, train_csv, eval_csv, output_path):
     longest_text_idx = samples_len.index(max(samples_len))
     speaker_ref = train_samples[longest_text_idx]["audio_file"]
 
+    # deallocate VRAM
+    del model, trainer
+
     return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer.output_path, speaker_ref
diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
index 9dcaefceb9..24a449ece4 100644
--- a/TTS/demos/xtts_ft_demo/xtts_demo.py
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -9,17 +9,23 @@
 import os
 import torch
 import torchaudio
-from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list, list_audios
+from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
 from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
 
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 
+def clear_gpu_cache():
+    # clear the GPU cache
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
 PORT = 5003
 
 XTTS_MODEL = None
 def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
+    clear_gpu_cache()
     global XTTS_MODEL
     config = XttsConfig()
     config.load_json(xtts_config)
@@ -144,13 +150,23 @@ def read_logs():
     prompt_compute_btn = gr.Button(value="Step 1 - Create dataset.")
 
     def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
+        clear_gpu_cache()
         out_path = os.path.join(out_path, "dataset")
         os.makedirs(out_path, exist_ok=True)
         if audio_path is None:
             # ToDo: raise an error
             pass
         else:
-            train_meta, eval_meta = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
+            train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
+
+        clear_gpu_cache()
+
+        # if the total audio length is less than 2 minutes, raise an error
+        if audio_total_size < 120:
+            message = "The sum of the duration of the audios that you provided should be at least 2 minutes!"
+            print(message)
+            return message, " ", " "
 
         print("Dataset Processed!")
         return "Dataset Processed!", train_meta, eval_meta
@@ -173,7 +189,7 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
             minimum=2,
             maximum=512,
             step=1,
-            value=15,
+            value=16,
         )
         progress_train = gr.Label(
             label="Progress:"
@@ -186,8 +202,7 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
         train_btn = gr.Button(value="Step 2 - Run the training")
 
     def train_model(language, train_csv, eval_csv, num_epochs, batch_size, output_path, progress=gr.Progress(track_tqdm=True)):
-        # train_csv = '/tmp/tmprh4k_vou/metadata_train.csv'
-        # eval_csv = '/tmp/tmprh4k_vou/metadata_eval.csv'
+        clear_gpu_cache()
 
         config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, train_csv, eval_csv, output_path=output_path)
         # copy original files to avoid parameter change issues
@@ -196,6 +211,7 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, output_path, progress=gr.Progress(track_tqdm=True)):
         ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
         print("Model training done!")
+        clear_gpu_cache()
         return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav

From c76fb856d1d3540e8c0c30714d91ae8113507faf Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 24 Nov 2023 15:40:35 -0300
Subject: [PATCH 09/22] Update gradio demo

---
 TTS/demos/xtts_ft_demo/utils/gpt_train.py | 2 +-
 TTS/demos/xtts_ft_demo/xtts_demo.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
index 4d33d6fc3b..2c51e4362e 100644
--- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py
+++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
@@ -23,7 +23,7 @@ def train_gpt(language, num_epochs, batch_size, train_csv, eval_csv, output_path):
     OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-gpu training please make it False
     START_WITH_EVAL = True  # if True it will start with evaluation
     BATCH_SIZE = batch_size  # set here the batch size
-    GRAD_ACUMM_STEPS = 1  # set here the grad accumulation steps
+    GRAD_ACUMM_STEPS = 4  # set here the grad accumulation steps
     # Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly.
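[The 252 in the note above is an effective-batch-size target: the optimizer sees BATCH_SIZE * GRAD_ACUMM_STEPS samples per weight update, so this patch's defaults (BATCH_SIZE=4 from the slider change below, GRAD_ACUMM_STEPS=4 here) give an effective batch of only 16. A small illustration of the arithmetic; grad_acumm_for is a hypothetical helper, not code from the patch.]

import math

def grad_acumm_for(batch_size, target_effective_batch=252):
    # accumulation steps needed to reach the recommended effective batch size
    return math.ceil(target_effective_batch / batch_size)

print(grad_acumm_for(4))    # 63
print(grad_acumm_for(252))  # 1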
diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
index 24a449ece4..69d2dd6014 100644
--- a/TTS/demos/xtts_ft_demo/xtts_demo.py
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -189,7 +189,7 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
             minimum=2,
             maximum=512,
             step=1,
-            value=16,
+            value=4,
         )
         progress_train = gr.Label(
             label="Progress:"

From 70f2cb9c0ed7a7c6452a89e09d1bf055d818038f Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 24 Nov 2023 15:53:34 -0300
Subject: [PATCH 10/22] Update gradio demo

---
 TTS/demos/xtts_ft_demo/utils/gpt_train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
index 2c51e4362e..4d33d6fc3b 100644
--- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py
+++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
@@ -23,7 +23,7 @@ def train_gpt(language, num_epochs, batch_size, train_csv, eval_csv, output_path):
     OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-gpu training please make it False
     START_WITH_EVAL = True  # if True it will start with evaluation
     BATCH_SIZE = batch_size  # set here the batch size
-    GRAD_ACUMM_STEPS = 4  # set here the grad accumulation steps
+    GRAD_ACUMM_STEPS = 1  # set here the grad accumulation steps
     # Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly.

From 335b8c37b39d53c168f962c69cb51f81e14c5cfa Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 24 Nov 2023 16:31:14 -0300
Subject: [PATCH 11/22] Update gradio demo

---
 TTS/demos/xtts_ft_demo/utils/gpt_train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
index 4d33d6fc3b..a80370bc80 100644
--- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py
+++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
@@ -165,6 +165,6 @@ def train_gpt(language, num_epochs, batch_size, train_csv, eval_csv, output_path):
     speaker_ref = train_samples[longest_text_idx]["audio_file"]
 
     # deallocate VRAM
-    del model, trainer
+    del model
 
     return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer.output_path, speaker_ref

From eaa5355c91ba6949d9a1e79d57a3392750f4d9d1 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Mon, 27 Nov 2023 10:01:48 -0300
Subject: [PATCH 12/22] Add parameters to be able to set them on colab demo

---
 TTS/demos/xtts_ft_demo/utils/gpt_train.py |  10 +-
 TTS/demos/xtts_ft_demo/xtts_demo.py       | 492 ++++++++++++----------
 2 files changed, 274 insertions(+), 228 deletions(-)

diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
index a80370bc80..54fade3823 100644
--- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py
+++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
@@ -8,9 +8,9 @@
 from TTS.utils.manage import ModelManager
 
-def train_gpt(language, num_epochs, batch_size, train_csv, eval_csv, output_path):
+def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path):
     # Logging parameters
-    RUN_NAME = "GPT_XTTSv2.1_FT"
+    RUN_NAME = "GPT_XTTS_FT"
     PROJECT_NAME = "XTTS_trainer"
     DASHBOARD_LOGGER = "tensorboard"
     LOGGER_URI = None
@@ -18,13 +18,11 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path):
     OUT_PATH = os.path.join(output_path, "run", "training")
-
     # Training Parameters
     OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False
-    START_WITH_EVAL = True # if True it will start with evaluation
+    START_WITH_EVAL = False # if True it will start with evaluation
     BATCH_SIZE = batch_size # set here the batch size
-    GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps
-    # Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly.
+    GRAD_ACUMM_STEPS = grad_acumm # set here the grad accumulation steps
 
 
     # Define here the dataset that you want to use for the fine-tuning on.
diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
index 69d2dd6014..662c9c9f78 100644
--- a/TTS/demos/xtts_ft_demo/xtts_demo.py
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -1,3 +1,4 @@
+import argparse
 import os
 import sys
 import tempfile
@@ -21,7 +22,6 @@ def clear_gpu_cache():
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
 
-PORT = 5003
 XTTS_MODEL = None
 
 def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
@@ -101,231 +101,279 @@ def read_logs():
         return f.read()
 
 
-with gr.Blocks() as demo:
-    with gr.Tab("Data processing"):
-        out_path = gr.Textbox(
-            label="Output path (where data and checkpoints will be saved):",
-            value="/tmp/xtts_ft/"
-        )
-        # upload_file = gr.Audio(
-        #     sources="upload",
-        #     label="Select here the audio files that you want to use for XTTS training !",
-        #     type="filepath",
-        # )
-        upload_file = gr.File(
-            file_count="multiple",
-            label="Select here the audio files that you want to use for XTTS training (Supported formats: wav, mp3, and flac)",
-        )
-        lang = gr.Dropdown(
-            label="Dataset Language",
-            value="en",
-            choices=[
-                "en",
-                "es",
-                "fr",
-                "de",
-                "it",
-                "pt",
-                "pl",
-                "tr",
-                "ru",
-                "nl",
-                "cs",
-                "ar",
-                "zh",
-                "hu",
-                "ko",
-                "ja"
-            ],
-        )
-        progress_data = gr.Label(
-            label="Progress:"
-        )
-        logs = gr.Textbox(
-            label="Logs:",
-            interactive=False,
-        )
-        demo.load(read_logs, None, logs, every=1)
-
-        prompt_compute_btn = gr.Button(value="Step 1 - Create dataset.")
-
-        def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
-            clear_gpu_cache()
-            out_path = os.path.join(out_path, "dataset")
-            os.makedirs(out_path, exist_ok=True)
-            if audio_path is None:
-                # ToDo: raise an error
-                pass
-            else:
-                train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
-
-            clear_gpu_cache()
-
-            # if audio total len is less than 2 minutes raise an error
-            if audio_total_size < 120:
-                message = "The sum of the duration of the audios that you provided should be at least 2 minutes!"
-                print(message)
-                return message, " ", " "
-
-            print("Dataset Processed!")
-            return "Dataset Processed!", train_meta, eval_meta
-
-    with gr.Tab("Fine-tuning XTTS Encoder"):
-        train_csv = gr.Textbox(
-            label="Train CSV:",
-        )
-        eval_csv = gr.Textbox(
-            label="Eval CSV:",
-        )
-        num_epochs = gr.Slider(
-            label="num_epochs",
-            minimum=1,
-            maximum=100,
-            step=1,
-            value=10,
-        )
-        batch_size = gr.Slider(
-            label="batch_size",
-            minimum=2,
-            maximum=512,
-            step=1,
-            value=4,
-        )
-        progress_train = gr.Label(
-            label="Progress:"
-        )
-        logs_tts_train = gr.Textbox(
-            label="Logs:",
-            interactive=False,
-        )
-        demo.load(read_logs, None, logs_tts_train, every=1)
-        train_btn = gr.Button(value="Step 2 - Run the training")
-
-        def train_model(language, train_csv, eval_csv, num_epochs, batch_size, output_path, progress=gr.Progress(track_tqdm=True)):
-            clear_gpu_cache()
-
-            config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, train_csv, eval_csv, output_path=output_path)
-            # copy original files to avoid parameter changes issues
-            os.system(f"cp {config_path} {exp_path}")
-            os.system(f"cp {vocab_file} {exp_path}")
-
-            ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
-            print("Model training done!")
-            clear_gpu_cache()
-            return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
-
-
-    with gr.Tab("Inference"):
-        with gr.Row():
-            with gr.Column() as col1:
-                xtts_checkpoint = gr.Textbox(
-                    label="XTTS checkpoint path:",
-                    value="",
-                )
-                xtts_config = gr.Textbox(
-                    label="XTTS config path:",
-                    value="",
-                )
-                xtts_vocab = gr.Textbox(
-                    label="XTTS config path:",
-                    value="",
-                )
-                progress_load = gr.Label(
-                    label="Progress:"
-                )
-                load_btn = gr.Button(value="Step 3 - Load Fine tuned XTTS model")
-
-            with gr.Column() as col2:
-                speaker_reference_audio = gr.Textbox(
-                    label="Speaker reference audio:",
-                    value="",
-                )
-                tts_language = gr.Dropdown(
-                    label="Language",
-                    value="en",
-                    choices=[
-                        "en",
-                        "es",
-                        "fr",
-                        "de",
-                        "it",
-                        "pt",
-                        "pl",
-                        "tr",
-                        "ru",
-                        "nl",
-                        "cs",
-                        "ar",
-                        "zh",
-                        "hu",
-                        "ko",
-                        "ja",
-                    ]
-                )
-                tts_text = gr.Textbox(
-                    label="Input Text.",
-                    value="This model sounds really good and above all, it's reasonably fast.",
-                )
-                tts_btn = gr.Button(value="Step 4 - Inference")
-
-            with gr.Column() as col3:
-                tts_output_audio = gr.Audio(label="Generated Audio.")
-                reference_audio = gr.Audio(label="Reference audio used.")
-
-    prompt_compute_btn.click(
-        fn=preprocess_dataset,
-        inputs=[
-            upload_file,
-            lang,
-            out_path,
-        ],
-        outputs=[
-            progress_data,
-            train_csv,
-            eval_csv,
-        ],
-    )
-
-
-    train_btn.click(
-        fn=train_model,
-        inputs=[
-            lang,
-            train_csv,
-            eval_csv,
-            num_epochs,
-            batch_size,
-            out_path,
-        ],
-        outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
-    )
-
-    load_btn.click(
-        fn=load_model,
-        inputs=[
-            xtts_checkpoint,
-            xtts_config,
-            xtts_vocab
-        ],
-        outputs=[progress_load],
-    )
-
-    tts_btn.click(
-        fn=run_tts,
-        inputs=[
-            tts_language,
-            tts_text,
-            speaker_reference_audio,
-        ],
-        outputs=[tts_output_audio, reference_audio],
-    )
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="""XTTS fine-tuning demo\n\n"""
+        """
+        Example runs:
+        python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port
+        """,
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        help="Port to run the gradio demo. Default: 5003",
+        default=5003,
+    )
+    parser.add_argument(
+        "--out_path",
+        type=str,
+        help="Output path (where data and checkpoints will be saved). Default: /tmp/xtts_ft/",
+        default="/tmp/xtts_ft/",
+    )
+    parser.add_argument(
+        "--num_epochs",
+        type=int,
+        help="Number of epochs to train. Default: 10",
+        default=10,
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        help="Batch size. Default: 4",
+        default=4,
+    )
+    parser.add_argument(
+        "--grad_acumm",
+        type=int,
+        help="Grad accumulation steps. Default: 1",
+        default=1,
+    )
+    args = parser.parse_args()
+
+    with gr.Blocks() as demo:
+        with gr.Tab("Data processing"):
+            out_path = gr.Textbox(
+                label="Output path (where data and checkpoints will be saved):",
+                value=args.out_path,
+            )
+            # upload_file = gr.Audio(
+            #     sources="upload",
+            #     label="Select here the audio files that you want to use for XTTS training !",
+            #     type="filepath",
+            # )
+            upload_file = gr.File(
+                file_count="multiple",
+                label="Select here the audio files that you want to use for XTTS training (Supported formats: wav, mp3, and flac)",
+            )
+            lang = gr.Dropdown(
+                label="Dataset Language",
+                value="en",
+                choices=[
+                    "en",
+                    "es",
+                    "fr",
+                    "de",
+                    "it",
+                    "pt",
+                    "pl",
+                    "tr",
+                    "ru",
+                    "nl",
+                    "cs",
+                    "ar",
+                    "zh",
+                    "hu",
+                    "ko",
+                    "ja"
+                ],
+            )
+            progress_data = gr.Label(
+                label="Progress:"
+            )
+            logs = gr.Textbox(
+                label="Logs:",
+                interactive=False,
+            )
+            demo.load(read_logs, None, logs, every=1)
+
+            prompt_compute_btn = gr.Button(value="Step 1 - Create dataset.")
+
+            def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
+                clear_gpu_cache()
+                out_path = os.path.join(out_path, "dataset")
+                os.makedirs(out_path, exist_ok=True)
+                if audio_path is None:
+                    # ToDo: raise an error
+                    pass
+                else:
+                    train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
+
+                clear_gpu_cache()
+
+                # if audio total len is less than 2 minutes raise an error
+                if audio_total_size < 120:
+                    message = "The sum of the duration of the audios that you provided should be at least 2 minutes!"
+                    print(message)
+                    return message, " ", " "
+
+                print("Dataset Processed!")
+                return "Dataset Processed!", train_meta, eval_meta
+
+        with gr.Tab("Fine-tuning XTTS Encoder"):
+            train_csv = gr.Textbox(
+                label="Train CSV:",
+            )
+            eval_csv = gr.Textbox(
+                label="Eval CSV:",
+            )
+            num_epochs = gr.Slider(
+                label="Number of epochs:",
+                minimum=1,
+                maximum=100,
+                step=1,
+                value=args.num_epochs,
+            )
+            batch_size = gr.Slider(
+                label="Batch size:",
+                minimum=2,
+                maximum=512,
+                step=1,
+                value=args.batch_size,
+            )
+            grad_acumm = gr.Slider(
+                label="Grad accumulation steps:",
+                minimum=2,
+                maximum=128,
+                step=1,
+                value=args.grad_acumm,
+            )
+            progress_train = gr.Label(
+                label="Progress:"
+            )
+            logs_tts_train = gr.Textbox(
+                label="Logs:",
+                interactive=False,
+            )
+            demo.load(read_logs, None, logs_tts_train, every=1)
+            train_btn = gr.Button(value="Step 2 - Run the training")
+
+            def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path):
+                clear_gpu_cache()
+
+                config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path)
+                # copy original files to avoid parameter changes issues
+                os.system(f"cp {config_path} {exp_path}")
+                os.system(f"cp {vocab_file} {exp_path}")
+
+                ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
+                print("Model training done!")
+                clear_gpu_cache()
+                return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
+
+
+        with gr.Tab("Inference"):
+            with gr.Row():
+                with gr.Column() as col1:
+                    xtts_checkpoint = gr.Textbox(
+                        label="XTTS checkpoint path:",
+                        value="",
+                    )
+                    xtts_config = gr.Textbox(
+                        label="XTTS config path:",
+                        value="",
+                    )
+                    xtts_vocab = gr.Textbox(
+                        label="XTTS config path:",
+                        value="",
+                    )
+                    progress_load = gr.Label(
+                        label="Progress:"
+                    )
+                    load_btn = gr.Button(value="Step 3 - Load Fine tuned XTTS model")
+
+                with gr.Column() as col2:
+                    speaker_reference_audio = gr.Textbox(
+                        label="Speaker reference audio:",
+                        value="",
+                    )
+                    tts_language = gr.Dropdown(
+                        label="Language",
+                        value="en",
+                        choices=[
+                            "en",
+                            "es",
+                            "fr",
+                            "de",
+                            "it",
+                            "pt",
+                            "pl",
+                            "tr",
+                            "ru",
+                            "nl",
+                            "cs",
+                            "ar",
+                            "zh",
+                            "hu",
+                            "ko",
+                            "ja",
+                        ]
+                    )
+                    tts_text = gr.Textbox(
+                        label="Input Text.",
+                        value="This model sounds really good and above all, it's reasonably fast.",
+                    )
+                    tts_btn = gr.Button(value="Step 4 - Inference")
+
+                with gr.Column() as col3:
+                    tts_output_audio = gr.Audio(label="Generated Audio.")
+                    reference_audio = gr.Audio(label="Reference audio used.")
+
+        prompt_compute_btn.click(
+            fn=preprocess_dataset,
+            inputs=[
+                upload_file,
+                lang,
+                out_path,
+            ],
+            outputs=[
+                progress_data,
+                train_csv,
+                eval_csv,
+            ],
+        )
+
+
+        train_btn.click(
+            fn=train_model,
+            inputs=[
+                lang,
+                train_csv,
+                eval_csv,
+                num_epochs,
+                batch_size,
+                grad_acumm,
+                out_path,
+            ],
+            outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
+        )
+
+        load_btn.click(
+            fn=load_model,
+            inputs=[
+                xtts_checkpoint,
+                xtts_config,
+                xtts_vocab
+            ],
+            outputs=[progress_load],
+        )
+
+        tts_btn.click(
+            fn=run_tts,
+            inputs=[
+                tts_language,
+                tts_text,
+                speaker_reference_audio,
+            ],
+            outputs=[tts_output_audio, reference_audio],
+        )
 
-if __name__ == "__main__":
     demo.launch(
         share=True,
-        debug=True,
-        server_port=PORT,
+        debug=False,
+        server_port=args.port,
         server_name="0.0.0.0"
     )
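`preprocess_dataset()` above rejects datasets whose total duration is under 120 seconds, using the `audio_total_size` value returned by `format_audio_list()`. A rough sketch of how such a total could be computed with torchaudio (illustrative only, not the formatter's actual implementation):

```python
# Illustrative only: sums clip durations in seconds, mirroring the
# "at least 2 minutes" check in preprocess_dataset() above.
import torchaudio

def total_audio_seconds(audio_paths):
    total = 0.0
    for path in audio_paths:
        info = torchaudio.info(path)  # reads the header, no full decode
        total += info.num_frames / info.sample_rate
    return total

# e.g. reject the dataset when total_audio_seconds(files) < 120
```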
From c5cb7eb791fb6331efbd946329892c1ec8319ee3 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Mon, 27 Nov 2023 10:41:09 -0300
Subject: [PATCH 13/22] Add error messages

---
 TTS/demos/xtts_ft_demo/xtts_demo.py | 32 +++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
index 662c9c9f78..6115c55235 100644
--- a/TTS/demos/xtts_ft_demo/xtts_demo.py
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -10,6 +10,7 @@
 import os
 import torch
 import torchaudio
+import traceback
 
 from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
 from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
@@ -22,11 +23,12 @@ def clear_gpu_cache():
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
 
-
 XTTS_MODEL = None
 
 def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
-    clear_gpu_cache()
     global XTTS_MODEL
+    clear_gpu_cache()
+    if not xtts_checkpoint or not xtts_config or not xtts_vocab:
+        return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"
     config = XttsConfig()
     config.load_json(xtts_config)
     XTTS_MODEL = Xtts.init_from_config(config)
@@ -39,6 +41,9 @@ def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
     return "Model Loaded!"
 
 def run_tts(lang, tts_text, speaker_audio_file):
+    if XTTS_MODEL is None or not speaker_audio_file:
+        return "You need to run the previous step to load the model !!", None, None
+
     gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_length=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs)
     out = XTTS_MODEL.inference(
         text=tts_text,
@@ -57,7 +62,7 @@ def run_tts(lang, tts_text, speaker_audio_file):
         out_path = fp.name
         torchaudio.save(out_path, out["wav"], 24000)
 
-    return out_path, speaker_audio_file
+    return "Speech generated !", out_path, speaker_audio_file
 
 
@@ -197,8 +202,7 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(trac
                 out_path = os.path.join(out_path, "dataset")
                 os.makedirs(out_path, exist_ok=True)
                 if audio_path is None:
-                    # ToDo: raise an error
-                    pass
+                    return "You should provide one or multiple audio files! If you provided them, the upload of the files is probably not finished yet!", "", ""
                 else:
                     train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
 
@@ -208,7 +212,7 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(trac
                 if audio_total_size < 120:
                     message = "The sum of the duration of the audios that you provided should be at least 2 minutes!"
                     print(message)
-                    return message, " ", " "
+                    return message, "", ""
 
                 print("Dataset Processed!")
                 return "Dataset Processed!", train_meta, eval_meta
@@ -253,8 +257,14 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(trac
             def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path):
                 clear_gpu_cache()
+                if not train_csv or not eval_csv:
+                    return "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", "", "", "", ""
+                try:
+                    config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path)
+                except Exception as e:
+                    traceback.print_exc()
+                    return f"The training was interrupted due to an error !! Please check the console to see the error message! Error summary: {e}", "", "", "", ""
 
-                config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path)
                 # copy original files to avoid parameter changes issues
                 os.system(f"cp {config_path} {exp_path}")
                 os.system(f"cp {vocab_file} {exp_path}")
@@ -276,8 +286,9 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum
                         label="XTTS config path:",
                         value="",
                     )
+
                     xtts_vocab = gr.Textbox(
-                        label="XTTS config path:",
+                        label="XTTS vocab path:",
                         value="",
                     )
                     progress_load = gr.Label(
@@ -319,6 +330,9 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum
                     tts_btn = gr.Button(value="Step 4 - Inference")
 
                 with gr.Column() as col3:
+                    progress_gen = gr.Label(
+                        label="Progress:"
+                    )
                     tts_output_audio = gr.Audio(label="Generated Audio.")
                     reference_audio = gr.Audio(label="Reference audio used.")
 
@@ -368,7 +382,7 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum
                 tts_text,
                 speaker_reference_audio,
             ],
-            outputs=[tts_output_audio, reference_audio],
+            outputs=[progress_gen, tts_output_audio, reference_audio],
         )
 
     demo.launch(

From e6c51e366651b342cfa1cbc88a32ff52bfb425cb Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Mon, 27 Nov 2023 10:53:43 -0300
Subject: [PATCH 14/22] Add intuitive error messages

---
 TTS/demos/xtts_ft_demo/xtts_demo.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
index 6115c55235..ee3eedbe2b 100644
--- a/TTS/demos/xtts_ft_demo/xtts_demo.py
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -204,7 +204,12 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(trac
                 if audio_path is None:
                     return "You should provide one or multiple audio files! If you provided them, the upload of the files is probably not finished yet!", "", ""
                 else:
-                    train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
+                    try:
+                        train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
+                    except:
+                        traceback.print_exc()
+                        error = traceback.format_exc()
+                        return f"The data processing was interrupted due to an error !! Please check the console to verify the full error message! \n Error summary: {error}", "", ""
 
                 clear_gpu_cache()
 
@@ -261,9 +266,10 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum
                     return "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", "", "", "", ""
                 try:
                     config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path)
-                except Exception as e:
+                except:
                     traceback.print_exc()
-                    return f"The training was interrupted due to an error !! Please check the console to see the error message! Error summary: {e}", "", "", "", ""
+                    error = traceback.format_exc()
+                    return f"The training was interrupted due to an error !! Please check the console to verify the full error message! \n Error summary: {error}", "", "", "", ""
 
                 # copy original files to avoid parameter changes issues
                 os.system(f"cp {config_path} {exp_path}")

From ceb8b05abe8c90b6643b1a348ecf9e841fa96baa Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Mon, 27 Nov 2023 11:16:41 -0300
Subject: [PATCH 15/22] Update

---
 TTS/demos/xtts_ft_demo/xtts_demo.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
index ee3eedbe2b..43448adca3 100644
--- a/TTS/demos/xtts_ft_demo/xtts_demo.py
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -150,7 +150,7 @@ def read_logs():
     args = parser.parse_args()
 
     with gr.Blocks() as demo:
-        with gr.Tab("Data processing"):
+        with gr.Tab("1 - Data processing"):
             out_path = gr.Textbox(
                 label="Output path (where data and checkpoints will be saved):",
                 value=args.out_path,
@@ -222,7 +222,7 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(trac
                 print("Dataset Processed!")
                 return "Dataset Processed!", train_meta, eval_meta
 
-        with gr.Tab("Fine-tuning XTTS Encoder"):
+        with gr.Tab("2 - Fine-tuning XTTS Encoder"):
             train_csv = gr.Textbox(
                 label="Train CSV:",
             )
@@ -281,7 +281,7 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum
             return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
 
 
-        with gr.Tab("Inference"):
+        with gr.Tab("3 - Inference"):
             with gr.Row():
                 with gr.Column() as col1:
                     xtts_checkpoint = gr.Textbox(

From 1a60767d8396a9ca4f792aaeaf68f7eef40f0955 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Mon, 27 Nov 2023 12:10:43 -0300
Subject: [PATCH 16/22] Add max_audio_length parameter

---
 TTS/demos/xtts_ft_demo/utils/gpt_train.py |  4 ++--
 TTS/demos/xtts_ft_demo/xtts_demo.py       | 22 +++++++++++++++++++---
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
index 54fade3823..d6e2f31305 100644
--- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py
+++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
@@ -8,7 +8,7 @@
 from TTS.utils.manage import ModelManager
 
 
-def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path):
+def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path, max_audio_length=255995):
     # Logging parameters
     RUN_NAME = "GPT_XTTS_FT"
     PROJECT_NAME = "XTTS_trainer"
@@ -79,7 +79,7 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
         max_conditioning_length=132300, # 6 secs
         min_conditioning_length=66150, # 3 secs
         debug_loading_failures=False,
-        max_wav_length=255995, # ~11.6 seconds
+        max_wav_length=max_audio_length, # ~11.6 seconds
         max_text_length=200,
         mel_norm_file=MEL_NORM_FILE,
         dvae_checkpoint=DVAE_CHECKPOINT,
diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
index 43448adca3..8e9a88eb96 100644
--- a/TTS/demos/xtts_ft_demo/xtts_demo.py
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -147,6 +147,13 @@ def read_logs():
         help="Grad accumulation steps. Default: 1",
         default=1,
     )
+    parser.add_argument(
+        "--max_audio_length",
+        type=int,
+        help="Max permitted audio size in seconds. Default: 11",
+        default=11,
+    )
+
     args = parser.parse_args()
 
     with gr.Blocks() as demo:
@@ -250,6 +257,13 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(trac
                 step=1,
                 value=args.grad_acumm,
             )
+            max_audio_length = gr.Slider(
+                label="Max permitted audio size in seconds:",
+                minimum=2,
+                maximum=20,
+                step=1,
+                value=args.max_audio_length,
+            )
             progress_train = gr.Label(
                 label="Progress:"
             )
@@ -260,12 +274,14 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(trac
             demo.load(read_logs, None, logs_tts_train, every=1)
             train_btn = gr.Button(value="Step 2 - Run the training")
 
-            def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path):
+            def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length):
                 clear_gpu_cache()
                 if not train_csv or not eval_csv:
                     return "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", "", "", "", ""
                 try:
-                    config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path)
+                    # convert seconds to waveform frames
+                    max_audio_length = int(max_audio_length * 22050)
+                    config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path, max_audio_length=max_audio_length)
                 except:
                     traceback.print_exc()
                     error = traceback.format_exc()
@@ -280,7 +296,6 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum
                 clear_gpu_cache()
                 return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
 
-
         with gr.Tab("3 - Inference"):
             with gr.Row():
                 with gr.Column() as col1:
@@ -367,6 +382,7 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum
                 batch_size,
                 grad_acumm,
                 out_path,
+                max_audio_length,
             ],
             outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
        )
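The slider value is expressed in seconds, while `max_wav_length` expects waveform frames; patch 16 converts between the two at the dataset sampling rate of 22,050 Hz, which is also where the previous 255995 default (~11.6 s) comes from. A quick check of that arithmetic:

```python
# Seconds-to-frames conversion used by train_model() in the patch above,
# assuming the 22050 Hz dataset sampling rate it hardcodes.
SAMPLE_RATE = 22050

def seconds_to_frames(seconds: float) -> int:
    return int(seconds * SAMPLE_RATE)

print(seconds_to_frames(11))   # 242550 frames for the 11 s default
print(255995 / SAMPLE_RATE)    # ~11.61 s, the old max_wav_length default
```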
From 68964fca0d9bee907a1b27cf7a9de47b48d56b15 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 1 Dec 2023 09:13:34 -0300
Subject: [PATCH 17/22] Add XTTS fine-tuner docs

---
 docs/source/models/xtts.md | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md
index 03e44af170..f42e8d8f0d 100644
--- a/docs/source/models/xtts.md
+++ b/docs/source/models/xtts.md
@@ -175,6 +175,32 @@ torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
 
 ### Training
 
+#### Easy training
+To make `XTTS_v2` GPT encoder training easier for beginner users we built a Gradio demo that implements the whole fine-tuning pipeline. The Gradio demo enables the user to easily do the following steps:
+
+- Preprocessing of the uploaded audio or audio files in 🐸 TTS coqui formatter
+- Train the XTTS GPT encoder with the processed data
+- Inference support using the fine-tuned model
+
+The user can run this gradio demos locally or remotely using a Colab Notebook.
+
+##### Run demo on Colab
+To make the `XTTS_v2` fine-tuning more accessible for users that do not have good GPUs available we created a Google Colab Notebook.
+
+The Colab Notebook is available [here](https://colab.research.google.com/drive/1GiI4_X724M8q2W-zZ-jXo7cWTV7RfaH-?usp=sharing).
+
+To learn how to use this Colab Notebook please check the [XTTS fine-tuning video]().
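For readers who prefer code over the UI, the pipeline the demo drives can also be sketched directly from the two helpers added in the patches above, `format_audio_list` and `train_gpt`; the argument values below are just the demo defaults and the audio paths are placeholders, so treat this as a hedged sketch rather than a supported API:

```python
# Hedged sketch of the demo's pipeline, wired from the helpers added in
# the patches above; argument values are the demo defaults, not requirements.
from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt

audio_files = ["/path/to/clip1.wav", "/path/to/clip2.wav"]  # your recordings

# Step 1 - create the dataset (writes metadata_train.csv / metadata_eval.csv)
train_csv, eval_csv, total_seconds = format_audio_list(
    audio_files, target_language="en", out_path="/tmp/xtts_ft/dataset"
)
assert total_seconds >= 120, "the demo requires at least 2 minutes of audio"

# Step 2 - fine-tune the GPT encoder
config_path, checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(
    "en", num_epochs=10, batch_size=4, grad_acumm=1,
    train_csv=train_csv, eval_csv=eval_csv, output_path="/tmp/xtts_ft/"
)
```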
+
+##### Run demo locally
+
+To run the demo locally you need to do the following steps:
+1. Install 🐸 TTS following the instructions available [here](https://tts.readthedocs.io/en/dev/installation.html#installation).
+2. Install the gradio demo requirements with the command `python3 -m pip install -r TTS/demos/xtts_ft_demo/requirements.txt`
+3. Run the gradio demo using the command `python3 TTS/demos/xtts_ft_demo/xtts_demo.py`
+4. Follow the steps presented on the [XTTS fine-tuning video]() to be able to fine-tune and use the fine-tuned model.
+
+#### Advanced training
+
 A recipe for `XTTS_v2` GPT encoder training using `LJSpeech` dataset is available at https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech/xtts_v1/train_gpt_xtts.py
 
 You need to change the fields of the `BaseDatasetConfig` to match your dataset and then update `GPTArgs` and `GPTTrainerConfig` fields as you need. By default, it will use the same parameters that XTTS v1.1 model was trained with. To speed up the model convergence, as default, it will also download the XTTS v1.1 checkpoint and load it.
@@ -222,6 +248,7 @@ torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000)
 ```
 
+
 ## References and Acknowledgements
 - VallE: https://arxiv.org/abs/2301.02111
 - Tortoise Repo: https://github.com/neonbjb/tortoise-tts

From 5dd217a759d6303b599a75985f3664cff1a36d8a Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 1 Dec 2023 09:47:09 -0300
Subject: [PATCH 18/22] Update XTTS finetuner docs

---
 TTS/demos/xtts_ft_demo/xtts_demo.py |  4 ++--
 docs/source/models/xtts.md          | 18 +++++++++++++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
index 8e9a88eb96..ebb11f29d1 100644
--- a/TTS/demos/xtts_ft_demo/xtts_demo.py
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -202,7 +202,7 @@ def read_logs():
             demo.load(read_logs, None, logs, every=1)
 
-            prompt_compute_btn = gr.Button(value="Step 1 - Create dataset.")
+            prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")
 
             def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
                 clear_gpu_cache()
@@ -315,7 +315,7 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum
                     progress_load = gr.Label(
                         label="Progress:"
                     )
-                    load_btn = gr.Button(value="Step 3 - Load Fine tuned XTTS model")
+                    load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")
 
                 with gr.Column() as col2:
                     speaker_reference_audio = gr.Textbox(
diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md
index f42e8d8f0d..92a981d765 100644
--- a/docs/source/models/xtts.md
+++ b/docs/source/models/xtts.md
@@ -182,7 +182,7 @@ To make `XTTS_v2` GPT encoder training easier for beginner users we built a Grad
 - Train the XTTS GPT encoder with the processed data
 - Inference support using the fine-tuned model
 
-The user can run this gradio demos locally or remotely using a Colab Notebook.
+The user can run this gradio demo locally or remotely using a Colab Notebook.
 
 ##### Run demo on Colab
 To make the `XTTS_v2` fine-tuning more accessible for users that do not have good GPUs available we created a Google Colab Notebook.
@@ -191,6 +191,15 @@ The Colab Notebook is available [here](https://colab.research.google.com/drive/1
 
 To learn how to use this Colab Notebook please check the [XTTS fine-tuning video]().
 
+If you are not able to access the video, you need to follow these steps:
+
+1. Open the Colab notebook and start the demo by running the first two cells (ignore pip install errors in the first one).
+2. Click on the link "Running on public URL:" on the second cell output.
+3. On the first Tab (1 - Data processing) you need to select the audio file or files, wait for upload, and then click on the button "Step 1 - Create dataset" and then wait until the dataset processing is done.
+4. As soon as the dataset processing is done you need to go to the second Tab (2 - Fine-tuning XTTS Encoder) and press the button "Step 2 - Run the training" and then wait until the training is finished. Note that it can take up to 40 minutes.
+5. As soon as the training is done you can go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded. Then you can do the inference on the model by clicking on the button "Step 4 - Inference".
+
 
 ##### Run demo locally
 
@@ -199,6 +208,13 @@ To run the demo locally you need to do the following steps:
 3. Run the gradio demo using the command `python3 TTS/demos/xtts_ft_demo/xtts_demo.py`
 4. Follow the steps presented on the [XTTS fine-tuning video]() to be able to fine-tune and use the fine-tuned model.
 
+
+If you are not able to acess the video you need to follow the steps:
+
+1. On the first Tab (1 - Data processing) you need to select the audio file or files, wait for upload, and then click on the button "Step 1 - Create dataset" and then wait until the dataset processing is done.
+2. Soon as the dataset processing is done you need to go to the second Tab (2 - Fine-tuning XTTS Encoder) and press the button "Step 2 - Run the training" and then wait until the training is finished. Note that it can take up to 40 minutes.
+3. Soon the training is done you can go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded. Then you can do the inference on the model by clicking on the button "Step 4 - Inference".
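Since patches 12 and 16 expose the demo's knobs as CLI flags, a local launch can also pin them explicitly; a sketch of such a launch, where the flag values are simply the documented defaults:

```python
# Launch sketch using the argparse flags added in patches 12 and 16;
# values shown are the defaults, adjust to your hardware.
import subprocess

subprocess.run(
    [
        "python3", "TTS/demos/xtts_ft_demo/xtts_demo.py",
        "--port", "5003",               # Gradio server port
        "--out_path", "/tmp/xtts_ft/",  # dataset + checkpoint output
        "--num_epochs", "10",
        "--batch_size", "4",
        "--grad_acumm", "1",
        "--max_audio_length", "11",     # seconds; converted to frames internally
    ],
    check=True,
)
```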
+
 #### Advanced training
 
 A recipe for `XTTS_v2` GPT encoder training using `LJSpeech` dataset is available at https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech/xtts_v1/train_gpt_xtts.py

From eb18b27afc7870718ebd06d02db0655c48502f7e Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 1 Dec 2023 14:07:33 -0300
Subject: [PATCH 19/22] Delete trainer to free memory

---
 TTS/demos/xtts_ft_demo/utils/gpt_train.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
index d6e2f31305..f0df162421 100644
--- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py
+++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
@@ -162,7 +162,9 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
     longest_text_idx = samples_len.index(max(samples_len))
     speaker_ref = train_samples[longest_text_idx]["audio_file"]
 
+    trainer_out_path = trainer.output_path
+
     # deallocate VRAM
-    del model
+    del model, trainer
 
-    return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer.output_path, speaker_ref
+    return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref

From 490af290d33b4011c252a9ec07839983656dfa2f Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 1 Dec 2023 15:21:33 -0300
Subject: [PATCH 20/22] Delete unused variables

---
 TTS/demos/xtts_ft_demo/utils/formatter.py | 2 +-
 TTS/demos/xtts_ft_demo/utils/gpt_train.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/TTS/demos/xtts_ft_demo/utils/formatter.py b/TTS/demos/xtts_ft_demo/utils/formatter.py
index e49d2426a7..42f12f7277 100644
--- a/TTS/demos/xtts_ft_demo/utils/formatter.py
+++ b/TTS/demos/xtts_ft_demo/utils/formatter.py
@@ -153,6 +153,6 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
     df_eval.to_csv(eval_metadata_path, sep="|", index=False)
 
     # deallocate VRAM
-    del asr_model
+    del asr_model, df_train, df_eval, df, metadata
 
     return train_metadata_path, eval_metadata_path, audio_total_size
\ No newline at end of file
diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
index f0df162421..7212ac38d7 100644
--- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py
+++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
@@ -165,6 +165,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
     trainer_out_path = trainer.output_path
 
     # deallocate VRAM
-    del model, trainer
+    del model, trainer, train_samples, eval_samples
 
     return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref

From e9a2c0606a829cf9345092c98bf1785132e0b3e3 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 1 Dec 2023 15:37:09 -0300
Subject: [PATCH 21/22] Add gc.collect()

---
 TTS/demos/xtts_ft_demo/utils/formatter.py | 4 +++-
 TTS/demos/xtts_ft_demo/utils/gpt_train.py | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/TTS/demos/xtts_ft_demo/utils/formatter.py b/TTS/demos/xtts_ft_demo/utils/formatter.py
index 42f12f7277..536faa0108 100644
--- a/TTS/demos/xtts_ft_demo/utils/formatter.py
+++ b/TTS/demos/xtts_ft_demo/utils/formatter.py
@@ -1,4 +1,5 @@
 import os
+import gc
 import torchaudio
 import pandas
 from faster_whisper import WhisperModel
@@ -152,7 +153,8 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
     df_eval = df_eval.sort_values('audio_file')
     df_eval.to_csv(eval_metadata_path, sep="|", index=False)
 
-    # deallocate VRAM
+    # deallocate VRAM and RAM
     del asr_model, df_train, df_eval, df, metadata
+    gc.collect()
 
     return train_metadata_path, eval_metadata_path, audio_total_size
\ No newline at end of file
diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
index 7212ac38d7..a98765c3e7 100644
--- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py
+++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py
@@ -1,4 +1,5 @@
 import os
+import gc
 
 from trainer import Trainer, TrainerArgs
 
@@ -164,7 +165,8 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
 
     trainer_out_path = trainer.output_path
 
-    # deallocate VRAM
+    # deallocate VRAM and RAM
     del model, trainer, train_samples, eval_samples
+    gc.collect()
 
     return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref

From 1936330adaad84812b0fafd2aa17cb7bba6edea9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 1 Dec 2023 23:52:06 +0100
Subject: [PATCH 22/22] Update xtts.md

---
 docs/source/models/xtts.md | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md
index 92a981d765..527dd3d068 100644
--- a/docs/source/models/xtts.md
+++ b/docs/source/models/xtts.md
@@ -204,16 +204,18 @@ To run the demo locally you need to do the following steps:
 1. Install 🐸 TTS following the instructions available [here](https://tts.readthedocs.io/en/dev/installation.html#installation).
-2. Install the gradio demo requirements with the command `python3 -m pip install -r TTS/demos/xtts_ft_demo/requirements.txt`
-3. Run the gradio demo using the command `python3 TTS/demos/xtts_ft_demo/xtts_demo.py`
-4. Follow the steps presented on the [XTTS fine-tuning video]() to be able to fine-tune and use the fine-tuned model.
+2. Install the Gradio demo requirements with the command `python3 -m pip install -r TTS/demos/xtts_ft_demo/requirements.txt`
+3. Run the Gradio demo using the command `python3 TTS/demos/xtts_ft_demo/xtts_demo.py`
+4. Follow the steps presented in the [tutorial video](https://www.youtube.com/watch?v=8tpDiiouGxc&feature=youtu.be) to be able to fine-tune and test the fine-tuned model.
 
 
-If you are not able to acess the video you need to follow the steps:
+If you are not able to access the video, here is what you need to do:
 
-1. On the first Tab (1 - Data processing) you need to select the audio file or files, wait for upload, and then click on the button "Step 1 - Create dataset" and then wait until the dataset processing is done.
-2. Soon as the dataset processing is done you need to go to the second Tab (2 - Fine-tuning XTTS Encoder) and press the button "Step 2 - Run the training" and then wait until the training is finished. Note that it can take up to 40 minutes.
-3. Soon the training is done you can go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded. Then you can do the inference on the model by clicking on the button "Step 4 - Inference".
+1. On the first Tab (1 - Data processing) select the audio file or files, wait for the upload.
+2. Click on the button "Step 1 - Create dataset" and then wait until the dataset processing is done.
+3. Go to the second Tab (2 - Fine-tuning XTTS Encoder) and press the button "Step 2 - Run the training" and then wait until the training is finished. It will take some time.
+4. Go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded.
+5. Now you can run inference with the model by clicking on the button "Step 4 - Inference".
 
 #### Advanced training
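As a closing illustration, the "Step 3 - Load" / "Step 4 - Inference" flow that the demo's Inference tab wraps boils down to the `load_model()` / `run_tts()` calls visible in patch 13. A condensed sketch, under the assumption that `load_checkpoint` accepts the checkpoint and vocab paths as keyword arguments (all paths are placeholders):

```python
# Condensed sketch of what the demo's "Step 3 - Load" / "Step 4 - Inference"
# buttons do (see load_model()/run_tts() in patch 13). The load_checkpoint
# keyword arguments are an assumption; paths are placeholders.
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

config = XttsConfig()
config.load_json("/tmp/xtts_ft/run/training/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path="best_model.pth", vocab_path="vocab.json")
if torch.cuda.is_available():
    model.cuda()

# Condition on the speaker reference returned by train_gpt(), then synthesize.
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
    audio_path="speaker_ref.wav",
    gpt_cond_len=config.gpt_cond_len,
    max_ref_length=config.max_ref_len,
    sound_norm_refs=config.sound_norm_refs,
)
out = model.inference(
    text="This model sounds really good and above all, it's reasonably fast.",
    language="en",
    gpt_cond_latent=gpt_cond_latent,
    speaker_embedding=speaker_embedding,
)
torchaudio.save("xtts_ft_output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
```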