Skip to content

Commit

Permalink
Update XTTS docs
Browse files Browse the repository at this point in the history
  • Loading branch information
Edresson committed Nov 6, 2023
1 parent 5918e6e commit aa0bbce
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 7 deletions.
6 changes: 3 additions & 3 deletions TTS/tts/models/xtts.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 3):
"""Compute the conditioning latents for the GPT model from the given audio.
Args:
audio_path (str): Path to the audio file.
audio (tensor): Audio tensor.
sr (int): Sample rate of the audio.
length (int): Length of the audio in seconds. Defaults to 3.
"""
Expand Down Expand Up @@ -447,7 +447,7 @@ def get_conditioning_latents(
audio_paths = list(audio_path)
else:
audio_paths = audio_path

print(audio_paths)
speaker_embeddings = []
diffusion_cond_latents = []
audios = []
Expand Down Expand Up @@ -492,7 +492,7 @@ def synthesize(self, text, config, speaker_wav, language, **kwargs):
Args:
text (str): Input text.
config (XttsConfig): Config with inference parameters.
speaker_wav (str): Path to the speaker audio file for cloning.
speaker_wav (list): List of paths to the speaker audio files to be used for cloning.
language (str): Language ID of the speaker.
**kwargs: Inference settings. See `inference()`.
Expand Down
41 changes: 37 additions & 4 deletions docs/source/models/xtts.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,19 +39,33 @@ You can also mail us at info@coqui.ai.
### Inference
#### 🐸TTS API

##### Single reference
```python
from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

# generate speech by cloning a voice using default settings
tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
file_path="output.wav",
speaker_wav="/path/to/target/speaker.wav",
speaker_wav=["/path/to/target/speaker.wav"],
language="en")
```

##### Multiple references
```python
from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

# generate speech by cloning a voice using default settings
tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
file_path="output.wav",
speaker_wav=["/path/to/target/speaker.wav", "/path/to/target/speaker_2.wav", "/path/to/target/speaker_3.wav"],
language="en")
```

#### 🐸TTS Command line

##### Single reference
```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
--text "Bugün okula gitmek istemiyorum." \
Expand All @@ -60,6 +74,25 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t
--use_cuda true
```

##### Multiple references
```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
--text "Bugün okula gitmek istemiyorum." \
--speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \
--language_idx tr \
--use_cuda true
```
Or, to use all wav files in a directory as references:

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
--text "Bugün okula gitmek istemiyorum." \
--speaker_wav /path/to/target/*.wav \
--language_idx tr \
--use_cuda true
```


#### Using the model directly

If you want to be able to run with `use_deepspeed=True` and enjoy the speedup, you need to install deepspeed first.
Expand All @@ -83,7 +116,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
model.cuda()

print("Computing speaker latents...")
gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path="reference.wav")
gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])

print("Inference...")
out = model.inference(
Expand Down Expand Up @@ -120,7 +153,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
model.cuda()

print("Computing speaker latents...")
gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path="reference.wav")
gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])

print("Inference...")
t0 = time.time()
Expand Down Expand Up @@ -177,7 +210,7 @@ model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENI
model.cuda()

print("Computing speaker latents...")
gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=SPEAKER_REFERENCE)
gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])

print("Inference...")
out = model.inference(
Expand Down

0 comments on commit aa0bbce

Please sign in to comment.