From aa0bbce64196f1e05e1461c9067d477948ec25a4 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Mon, 6 Nov 2023 14:18:02 -0300
Subject: [PATCH] Update XTTS docs

---
 TTS/tts/models/xtts.py     |  4 ++--
 docs/source/models/xtts.md | 41 ++++++++++++++++++++++++++++++++++----
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py
index 9a501db8d3..05e4d36941 100644
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@@ -373,7 +373,7 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 3):
         """Compute the conditioning latents for the GPT model from the given audio.
 
         Args:
-            audio_path (str): Path to the audio file.
+            audio (tensor): Input audio tensor.
             sr (int): Sample rate of the audio.
             length (int): Length of the audio in seconds. Defaults to 3.
         """
@@ -492,7 +492,7 @@ def synthesize(self, text, config, speaker_wav, language, **kwargs):
         Args:
             text (str): Input text.
             config (XttsConfig): Config with inference parameters.
-            speaker_wav (str): Path to the speaker audio file for cloning.
+            speaker_wav (list): List of paths to speaker audio files used for cloning.
            language (str): Language ID of the speaker.
             **kwargs: Inference settings. See `inference()`.
 
diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md
index 1d034aeadf..8167a1d1a9 100644
--- a/docs/source/models/xtts.md
+++ b/docs/source/models/xtts.md
@@ -39,6 +39,7 @@ You can also mail us at info@coqui.ai.
 
 ### Inference
 
 #### 🐸TTS API
+##### Single reference
 ```python
 from TTS.api import TTS
@@ -46,12 +47,25 @@ tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
 
 # generate speech by cloning a voice using default settings
 tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                 file_path="output.wav",
-                speaker_wav="/path/to/target/speaker.wav",
+                speaker_wav=["/path/to/target/speaker.wav"],
+                language="en")
+```
+
+##### Multiple references
+```python
+from TTS.api import TTS
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
+
+# generate speech by cloning a voice using default settings
+tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+                file_path="output.wav",
+                speaker_wav=["/path/to/target/speaker.wav", "/path/to/target/speaker_2.wav", "/path/to/target/speaker_3.wav"],
                 language="en")
 ```
 
 #### 🐸TTS Command line
+##### Single reference
 ```console
  tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
      --text "Bugün okula gitmek istemiyorum." \
@@ -60,6 +74,25 @@
      --use_cuda true
 ```
 
+##### Multiple references
+```console
+ tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
+     --text "Bugün okula gitmek istemiyorum." \
+     --speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \
+     --language_idx tr \
+     --use_cuda true
+```
+or, for all wav files in a directory, you can use:
+
+```console
+ tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
+     --text "Bugün okula gitmek istemiyorum." \
+     --speaker_wav /path/to/target/*.wav \
+     --language_idx tr \
+     --use_cuda true
+```
+
+
 #### model directly
 
 If you want to be able to run with `use_deepspeed=True` and enjoy the speedup, you need to install deepspeed first.
@@ -83,7 +116,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
 model.cuda()
 
 print("Computing speaker latents...")
-gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path="reference.wav")
+gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
 
 print("Inference...")
 out = model.inference(
@@ -120,7 +153,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
 model.cuda()
 
 print("Computing speaker latents...")
-gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path="reference.wav")
+gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
 
 print("Inference...")
 t0 = time.time()
@@ -177,7 +210,7 @@ model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENI
 model.cuda()
 
 print("Computing speaker latents...")
-gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=SPEAKER_REFERENCE)
+gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
 
 print("Inference...")
 out = model.inference(
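
For context, a minimal end-to-end sketch of what this patch enables: computing conditioning latents from several reference clips and running inference with them. The reference file names are placeholders, and since the full `inference()` signature is truncated in the hunk context above, the positional argument order and the 24 kHz save rate below are assumptions carried over from the surrounding doc snippets, not guarantees of this patch.

```python
import torch
import torchaudio

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Load the model as in the doc snippets above.
config = XttsConfig()
config.load_json("/path/to/xtts/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/")
model.cuda()

# After this patch, audio_path accepts a list of reference clips,
# so the conditioning is computed from several recordings.
gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(
    audio_path=["reference.wav", "reference_2.wav", "reference_3.wav"]
)

# Assumed positional order, mirroring the (truncated) doc examples above.
out = model.inference(
    "It took me quite a long time to develop a voice.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
    diffusion_conditioning,
)

# out["wav"] and the 24 kHz rate follow the existing XTTS doc examples.
torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
```

Several short reference clips tend to give a more stable voice clone than a single clip, which appears to be the motivation for the list-valued `speaker_wav` and `audio_path` arguments introduced here.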