Add lang code in XTTS doc #3158

Merged (4 commits, Nov 8, 2023)
Changes from all commits
TTS/tts/configs/xtts_config.py (0 additions, 22 deletions)

@@ -37,29 +37,11 @@ class XttsConfig(BaseTTSConfig):
             If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
             Defaults to `0.8`.
 
-        cond_free_k (float):
-            Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
-            As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
-            Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k. Defaults to `2.0`.
-
-        diffusion_temperature (float):
-            Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
-            are the "mean" prediction of the diffusion network and will sound bland and smeared.
-            Defaults to `1.0`.
-
         num_gpt_outputs (int):
             Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
             As XTTS is a probabilistic model, more samples means a higher probability of creating something "great".
             Defaults to `16`.
 
-        decoder_iterations (int):
-            Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
-            the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
-            however. Defaults to `30`.
-
-        decoder_sampler (str):
-            Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
-
         gpt_cond_len (int):
             Secs audio to be used as conditioning for the autoregressive model. Defaults to `3`.
 
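For readers tracking what is being removed: the deleted `cond_free_k` docstring above describes a classifier-free-guidance-style mix of the conditioned and unconditioned diffusion outputs. A purely illustrative one-function sketch of that (now removed) formula, with the docstring's `cond_absenct_output` typo corrected:

```python
# Sketch of the deleted conditioning-free mixing rule, restated from the
# docstring above (illustrative only; this code no longer exists in XTTS).
def mix_cond_free(cond_present_output, cond_absent_output, cond_free_k=2.0):
    # output = cond_present_output*(cond_free_k+1) - cond_absent_output*cond_free_k
    return cond_present_output * (cond_free_k + 1) - cond_absent_output * cond_free_k
```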
@@ -110,11 +92,7 @@ class XttsConfig(BaseTTSConfig):
     repetition_penalty: float = 2.0
     top_k: int = 50
     top_p: float = 0.85
-    cond_free_k: float = 2.0
-    diffusion_temperature: float = 1.0
     num_gpt_outputs: int = 1
-    decoder_iterations: int = 30
-    decoder_sampler: str = "ddim"
 
     # cloning
     gpt_cond_len: int = 3
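After these deletions, `XttsConfig` keeps only the GPT sampling knobs. A minimal sanity-check sketch, assuming a post-PR install of the package:

```python
from TTS.tts.configs.xtts_config import XttsConfig

config = XttsConfig()
print(config.top_k, config.top_p, config.num_gpt_outputs)  # 50 0.85 1
# The diffusion-era fields were removed, so this would now raise AttributeError:
# config.decoder_iterations
```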
TTS/tts/models/xtts.py (0 additions, 37 deletions)

@@ -152,19 +152,6 @@ class XttsArgs(Coqpit):
         gpt_code_stride_len (int, optional): The hop_size of dvae and consequently of the gpt output. Defaults to 1024.
         gpt_use_masking_gt_prompt_approach (bool, optional): If True, it will use ground truth as prompt and it will mask the loss to avoid repetition. Defaults to True.
         gpt_use_perceiver_resampler (bool, optional): If True, it will use perceiver resampler from flamingo paper - https://arxiv.org/abs/2204.14198. Defaults to False.
-
-        For DiffTTS model:
-        diff_model_channels (int, optional): The number of channels for the DiffTTS model. Defaults to 1024.
-        diff_num_layers (int, optional): The number of layers for the DiffTTS model. Defaults to 10.
-        diff_in_channels (int, optional): The input channels for the DiffTTS model. Defaults to 100.
-        diff_out_channels (int, optional): The output channels for the DiffTTS model. Defaults to 200.
-        diff_in_latent_channels (int, optional): The input latent channels for the DiffTTS model. Defaults to 1024.
-        diff_in_tokens (int, optional): The input tokens for the DiffTTS model. Defaults to 8193.
-        diff_dropout (int, optional): The dropout percentage for the DiffTTS model. Defaults to 0.
-        diff_use_fp16 (bool, optional): Whether to use fp16 for the DiffTTS model. Defaults to False.
-        diff_num_heads (int, optional): The number of heads for the DiffTTS model. Defaults to 16.
-        diff_layer_drop (int, optional): The layer dropout percentage for the DiffTTS model. Defaults to 0.
-        diff_unconditioned_percentage (int, optional): The percentage of unconditioned inputs for the DiffTTS model. Defaults to 0.
     """
 
     gpt_batch_size: int = 1
@@ -193,19 +180,6 @@ class XttsArgs(Coqpit):
     gpt_use_masking_gt_prompt_approach: bool = True
     gpt_use_perceiver_resampler: bool = False
 
-    # Diffusion Decoder params
-    diff_model_channels: int = 1024
-    diff_num_layers: int = 10
-    diff_in_channels: int = 100
-    diff_out_channels: int = 200
-    diff_in_latent_channels: int = 1024
-    diff_in_tokens: int = 8193
-    diff_dropout: int = 0
-    diff_use_fp16: bool = False
-    diff_num_heads: int = 16
-    diff_layer_drop: int = 0
-    diff_unconditioned_percentage: int = 0
-
     # HifiGAN Decoder params
     input_sample_rate: int = 22050
     output_sample_rate: int = 24000
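Likewise for `XttsArgs`: the decoder section now starts directly at the HiFi-GAN fields. A small sketch under the same post-PR assumption:

```python
from TTS.tts.models.xtts import XttsArgs

args = XttsArgs()
print(args.input_sample_rate, args.output_sample_rate)  # 22050 24000
# The diff_* fields (e.g. args.diff_num_layers) are gone after this PR.
```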
@@ -426,10 +400,6 @@ def inference_with_config(self, text, config, ref_audio_path, language, **kwargs
"repetition_penalty": config.repetition_penalty,
"top_k": config.top_k,
"top_p": config.top_p,
"cond_free_k": config.cond_free_k,
"diffusion_temperature": config.diffusion_temperature,
"decoder_iterations": config.decoder_iterations,
"decoder_sampler": config.decoder_sampler,
"gpt_cond_len": config.gpt_cond_len,
"max_ref_len": config.max_ref_len,
"sound_norm_refs": config.sound_norm_refs,
@@ -454,13 +424,6 @@ def full_inference(
         gpt_cond_len=6,
         max_ref_len=10,
         sound_norm_refs=False,
-        # Decoder inference
-        decoder_iterations=100,
-        cond_free=True,
-        cond_free_k=2,
-        diffusion_temperature=1.0,
-        decoder_sampler="ddim",
-        decoder="hifigan",
         **hf_generate_kwargs,
     ):
         """
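With the decoder-inference arguments gone, a `full_inference` call reduces to text, reference audio, language, and the cloning knobs shown above. A hypothetical sketch, assuming `model` is a loaded `Xtts` instance; keyword names are taken from the signatures visible in the diff:

```python
# Hypothetical post-PR call; no decoder/diffusion kwargs remain.
out = model.full_inference(
    text="It took me quite a long time to develop a voice.",
    ref_audio_path="reference.wav",
    language="en",
    gpt_cond_len=6,        # seconds of reference audio used for GPT conditioning
    max_ref_len=10,
    sound_norm_refs=False,
)
```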
docs/source/models/xtts.md (4 additions, 7 deletions)

@@ -24,8 +24,7 @@ a few tricks to make it faster and support streaming inference.
 Current implementation only supports inference.
 
 ### Languages
-As of now, XTTS-v2 supports 16 languages: English, Spanish, French, German, Italian, Portuguese,
-Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese (Simplified), Japanese, Hungarian, Korean
+As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko).
 
 Stay tuned as we continue to add support for more languages. If you have any language requests, please feel free to reach out.
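The codes in parentheses are the values the API expects in its `language` argument. As an illustration (the model id below is assumed to be the published XTTS-v2 release name), cloning a voice in French through the high-level `TTS` wrapper might look like:

```python
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
tts.tts_to_file(
    text="Bonjour, le monde !",
    file_path="output.wav",
    speaker_wav="reference.wav",  # short reference clip for voice cloning
    language="fr",                # any of the 16 codes listed above
)
```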

@@ -116,15 +115,14 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=True
 model.cuda()
 
 print("Computing speaker latents...")
-gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
+gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
 
 print("Inference...")
 out = model.inference(
     "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
     "en",
     gpt_cond_latent,
     speaker_embedding,
-    diffusion_conditioning,
     temperature=0.7, # Add custom parameters here
 )
 torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
@@ -153,7 +151,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=True
 model.cuda()
 
 print("Computing speaker latents...")
-gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
+gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
 
 print("Inference...")
 t0 = time.time()
@@ -210,15 +208,14 @@ model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENI
 model.cuda()
 
 print("Computing speaker latents...")
-gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
+gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
 
 print("Inference...")
 out = model.inference(
     "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
     "en",
     gpt_cond_latent,
     speaker_embedding,
-    diffusion_conditioning,
     temperature=0.7, # Add custom parameters here
 )
 torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000)