Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run CI for v0.20.6 #3249

Merged
merged 7 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/pypi-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
build-sdist:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Verify tag matches version
run: |
set -ex
Expand Down Expand Up @@ -38,7 +38,7 @@ jobs:
matrix:
python-version: ["3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
Expand Down
4 changes: 2 additions & 2 deletions TTS/.models.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
"multilingual": {
"multi-dataset": {
"xtts_v2": {
"description": "XTTS-v2 by Coqui with 16 languages.",
"description": "XTTS-v2.0.2 by Coqui with 16 languages.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
],
"model_hash": "6a09d1ad43896f06041ed8195956c9698f13b6189dc80f1c74bdc2b8e8d15324",
"model_hash": "5ce0502bfe3bc88dc8d9312b12a7558c",
"default_vocoder": null,
"commit": "480a6cdf7",
"license": "CPML",
Expand Down
2 changes: 1 addition & 1 deletion TTS/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.20.5
0.20.6
12 changes: 10 additions & 2 deletions TTS/tts/layers/tortoise/diffusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,18 @@
import numpy as np
import torch
import torch as th
from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
from tqdm import tqdm

from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper

K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
try:
from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral

K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
except ImportError:
K_DIFFUSION_SAMPLERS = None


SAMPLERS = ["dpm++2m", "p", "ddim"]


Expand Down Expand Up @@ -531,6 +537,8 @@ def sample_loop(self, *args, **kwargs):
if self.conditioning_free is not True:
raise RuntimeError("cond_free must be true")
with tqdm(total=self.num_timesteps) as pbar:
if K_DIFFUSION_SAMPLERS is None:
raise ModuleNotFoundError("Install k_diffusion for using k_diffusion samplers")
return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs)
else:
raise RuntimeError("sampler not impl")
Expand Down
4 changes: 3 additions & 1 deletion TTS/tts/layers/xtts/gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,9 @@ def forward(
audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token)

# Pad mel codes with stop_audio_token
audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3) # -3 to get the real code lengths without consider start and stop tokens that was not added yet
audio_codes = self.set_mel_padding(
audio_codes, code_lengths - 3
) # -3 to get the real code lengths without consider start and stop tokens that was not added yet

# Build input and target tensors
# Prepend start token to inputs and append stop token to targets
Expand Down
23 changes: 12 additions & 11 deletions TTS/tts/layers/xtts/tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,22 @@
import os
import re
import torch
import pypinyin
import textwrap

from functools import cached_property

import pypinyin
import torch
from hangul_romanize import Transliter
from hangul_romanize.rule import academic
from num2words import num2words
from spacy.lang.ar import Arabic
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.ja import Japanese
from spacy.lang.zh import Chinese
from tokenizers import Tokenizer

from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words

from spacy.lang.en import English
from spacy.lang.zh import Chinese
from spacy.lang.ja import Japanese
from spacy.lang.ar import Arabic
from spacy.lang.es import Spanish


def get_spacy_lang(lang):
if lang == "zh":
Expand All @@ -32,6 +31,7 @@ def get_spacy_lang(lang):
# For most languages, Enlish does the job
return English()


def split_sentence(text, lang, text_split_length=250):
"""Preprocess the input text"""
text_splits = []
Expand Down Expand Up @@ -67,6 +67,7 @@ def split_sentence(text, lang, text_split_length=250):

return text_splits


_whitespace_re = re.compile(r"\s+")

# List of (regular expression, replacement) pairs for abbreviations:
Expand Down Expand Up @@ -619,7 +620,7 @@ def katsu(self):
return cutlet.Cutlet()

def check_input_length(self, txt, lang):
lang = lang.split("-")[0] # remove the region
lang = lang.split("-")[0] # remove the region
limit = self.char_limits.get(lang, 250)
if len(txt) > limit:
print(
Expand All @@ -640,7 +641,7 @@ def preprocess_text(self, txt, lang):
return txt

def encode(self, txt, lang):
lang = lang.split("-")[0] # remove the region
lang = lang.split("-")[0] # remove the region
self.check_input_length(txt, lang)
txt = self.preprocess_text(txt, lang)
lang = "zh-cn" if lang == "zh" else lang
Expand Down
7 changes: 4 additions & 3 deletions TTS/tts/layers/xtts/trainer/gpt_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,9 +318,10 @@ def eval_step(self, batch, criterion):
batch["cond_idxs"] = None
return self.train_step(batch, criterion)

def on_epoch_start(self, trainer): # pylint: disable=W0613
# guarante that dvae will be in eval mode after .train() on evaluation end
self.dvae = self.dvae.eval()
def on_train_epoch_start(self, trainer):
trainer.model.eval() # the whole model to eval
# put gpt model in training mode
trainer.model.xtts.gpt.train()

def on_init_end(self, trainer): # pylint: disable=W0613
# ignore similarities.pth on clearml save/upload
Expand Down
14 changes: 5 additions & 9 deletions TTS/tts/models/xtts.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,13 +513,13 @@ def inference(
enable_text_splitting=False,
**hf_generate_kwargs,
):
language = language.split("-")[0] # remove the country code
language = language.split("-")[0] # remove the country code
length_scale = 1.0 / max(speed, 0.05)
if enable_text_splitting:
text = split_sentence(text, language, self.tokenizer.char_limits[language])
else:
text = [text]

wavs = []
gpt_latents_list = []
for sent in text:
Expand Down Expand Up @@ -563,9 +563,7 @@ def inference(

if length_scale != 1.0:
gpt_latents = F.interpolate(
gpt_latents.transpose(1, 2),
scale_factor=length_scale,
mode="linear"
gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
).transpose(1, 2)

gpt_latents_list.append(gpt_latents.cpu())
Expand Down Expand Up @@ -623,7 +621,7 @@ def inference_stream(
enable_text_splitting=False,
**hf_generate_kwargs,
):
language = language.split("-")[0] # remove the country code
language = language.split("-")[0] # remove the country code
length_scale = 1.0 / max(speed, 0.05)
if enable_text_splitting:
text = split_sentence(text, language, self.tokenizer.char_limits[language])
Expand Down Expand Up @@ -675,9 +673,7 @@ def inference_stream(
gpt_latents = torch.cat(all_latents, dim=0)[None, :]
if length_scale != 1.0:
gpt_latents = F.interpolate(
gpt_latents.transpose(1, 2),
scale_factor=length_scale,
mode="linear"
gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
).transpose(1, 2)
wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device))
wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks(
Expand Down
1 change: 1 addition & 0 deletions TTS/vocoder/configs/parallel_wavegan_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ class ParallelWaveganConfig(BaseGANVocoderConfig):
use_noise_augment: bool = False
use_cache: bool = True
steps_to_start_discriminator: int = 200000
target_loss: str = "loss_1"

# LOSS PARAMETERS - overrides
use_stft_loss: bool = True
Expand Down
45 changes: 22 additions & 23 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,33 +1,33 @@
# core deps
numpy==1.22.0;python_version<="3.10"
numpy==1.24.3;python_version>"3.10"
cython==0.29.30
numpy>=1.24.3;python_version>"3.10"
cython>=0.29.30
scipy>=1.11.2
torch>=2.1
torchaudio
soundfile==0.12.*
librosa==0.10.*
scikit-learn==1.3.0
soundfile>=0.12.0
librosa>=0.10.0
scikit-learn>=1.3.0
numba==0.55.1;python_version<"3.9"
numba==0.57.0;python_version>="3.9"
inflect==5.6.*
tqdm==4.64.*
anyascii==0.3.*
pyyaml==6.*
fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail
aiohttp==3.8.*
packaging==23.1
numba>=0.57.0;python_version>="3.9"
inflect>=5.6.0
tqdm>=4.64.1
anyascii>=0.3.0
pyyaml>=6.0
fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
aiohttp>=3.8.1
packaging>=23.1
# deps for examples
flask==2.*
flask>=2.0.1
# deps for inference
pysbd==0.3.4
pysbd>=0.3.4
# deps for notebooks
umap-learn==0.5.*
umap-learn>=0.5.1
pandas>=1.4,<2.0
# deps for training
matplotlib==3.7.*
matplotlib>=3.7.0
# coqui stack
trainer
trainer>=0.0.32
# config management
coqpit>=0.0.16
# chinese g2p deps
Expand All @@ -46,12 +46,11 @@ bangla
bnnumerizer
bnunicodenormalizer
#deps for tortoise
k_diffusion
einops==0.6.*
transformers==4.33.*
einops>=0.6.0
transformers>=4.33.0
#deps for bark
encodec==0.1.*
encodec>=0.1.1
# deps for XTTS
unidecode==1.3.*
unidecode>=1.3.2
num2words
spacy[ja]>=3
4 changes: 2 additions & 2 deletions tests/zoo_tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def test_xtts_v2_streaming():
"en",
gpt_cond_latent,
speaker_embedding,
speed=1.5
speed=1.5,
)
wav_chuncks = []
for i, chunk in enumerate(chunks):
Expand All @@ -198,7 +198,7 @@ def test_xtts_v2_streaming():
"en",
gpt_cond_latent,
speaker_embedding,
speed=0.66
speed=0.66,
)
wav_chuncks = []
for i, chunk in enumerate(chunks):
Expand Down
Loading