From aa0bbce64196f1e05e1461c9067d477948ec25a4 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Mon, 6 Nov 2023 14:18:02 -0300
Subject: [PATCH] Update XTTS docs

---
 TTS/tts/models/xtts.py     |  4 ++--
 docs/source/models/xtts.md | 41 ++++++++++++++++++++++++++++++++++----
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py
index 9a501db8d3..05e4d36941 100644
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@@ -373,7 +373,7 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 3):
         """Compute the conditioning latents for the GPT model from the given audio.
 
         Args:
-            audio_path (str): Path to the audio file.
+            audio (tensor): Input audio tensor.
             sr (int): Sample rate of the audio.
             length (int): Length of the audio in seconds. Defaults to 3.
         """
@@ -492,7 +492,7 @@ def synthesize(self, text, config, speaker_wav, language, **kwargs):
         Args:
             text (str): Input text.
             config (XttsConfig): Config with inference parameters.
-            speaker_wav (str): Path to the speaker audio file for cloning.
+            speaker_wav (list): List of paths to speaker audio files used for cloning.
            language (str): Language ID of the speaker.
             **kwargs: Inference settings. See `inference()`.
 
diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md
index 1d034aeadf..8167a1d1a9 100644
--- a/docs/source/models/xtts.md
+++ b/docs/source/models/xtts.md
@@ -39,6 +39,7 @@ You can also mail us at info@coqui.ai.
 
 ### Inference
 
 #### 🐸TTS API
+##### Single reference
 ```python
 from TTS.api import TTS
@@ -46,12 +47,25 @@ tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
 
 # generate speech by cloning a voice using default settings
 tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                 file_path="output.wav",
-                speaker_wav="/path/to/target/speaker.wav",
+                speaker_wav=["/path/to/target/speaker.wav"],
+                language="en")
+```
+
+##### Multiple references
+```python
+from TTS.api import TTS
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
+
+# generate speech by cloning a voice using default settings
+tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+                file_path="output.wav",
+                speaker_wav=["/path/to/target/speaker.wav", "/path/to/target/speaker_2.wav", "/path/to/target/speaker_3.wav"],
                 language="en")
 ```
 
 #### 🐸TTS Command line
+##### Single reference
 ```console
  tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
      --text "Bugün okula gitmek istemiyorum." \
@@ -60,6 +74,25 @@
      --use_cuda true
 ```
 
+##### Multiple references
+```console
+ tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
+     --text "Bugün okula gitmek istemiyorum." \
+     --speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \
+     --language_idx tr \
+     --use_cuda true
+```
+or, for all wav files in a directory, you can use:
+
+```console
+ tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
+     --text "Bugün okula gitmek istemiyorum." \
+     --speaker_wav /path/to/target/*.wav \
+     --language_idx tr \
+     --use_cuda true
+```
+
+
 #### model directly
 
 If you want to be able to run with `use_deepspeed=True` and enjoy the speedup, you need to install deepspeed first.
@@ -83,7 +116,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
 model.cuda()
 
 print("Computing speaker latents...")
-gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path="reference.wav")
+gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
 
 print("Inference...")
 out = model.inference(
@@ -120,7 +153,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
 model.cuda()
 
 print("Computing speaker latents...")
-gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path="reference.wav")
+gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
 
 print("Inference...")
 t0 = time.time()
@@ -177,7 +210,7 @@ model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENI
 model.cuda()
 
 print("Computing speaker latents...")
-gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=SPEAKER_REFERENCE)
+gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
 
 print("Inference...")
 out = model.inference(
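
For context, a minimal end-to-end sketch of what this patch enables: computing conditioning latents from several reference clips and running inference with them. The reference file names are placeholders, and since the full `inference()` signature is truncated in the hunk context above, the positional argument order and the 24 kHz save rate below are assumptions carried over from the surrounding doc snippets, not guarantees of this patch.

```python
import torch
import torchaudio

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Load the model as in the doc snippets above.
config = XttsConfig()
config.load_json("/path/to/xtts/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/")
model.cuda()

# After this patch, audio_path accepts a list of reference clips,
# so the conditioning is computed from several recordings.
gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(
    audio_path=["reference.wav", "reference_2.wav", "reference_3.wav"]
)

# Assumed positional order, mirroring the (truncated) doc examples above.
out = model.inference(
    "It took me quite a long time to develop a voice.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
    diffusion_conditioning,
)

# out["wav"] and the 24 kHz rate follow the existing XTTS doc examples.
torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
```

Several short reference clips tend to give a more stable voice clone than a single clip, which appears to be the motivation for the list-valued `speaker_wav` and `audio_path` arguments introduced here.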