Skip to content

Commit

Permalink
Remove coqui studio integration from TTS
Browse files Browse the repository at this point in the history
  • Loading branch information
WeberJulian committed Dec 11, 2023
1 parent 5cd750a commit 8c20a59
Show file tree
Hide file tree
Showing 11 changed files with 33 additions and 782 deletions.
53 changes: 0 additions & 53 deletions .github/workflows/api_tests.yml

This file was deleted.

3 changes: 0 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,6 @@ test_zoo: ## run zoo tests.
inference_tests: ## run inference tests.
nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests

api_tests: ## run api tests.
nose2 -F -v -B --with-coverage --coverage TTS tests.api_tests

data_tests: ## run data tests.
nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests

Expand Down
29 changes: 0 additions & 29 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html)
- 📣 **Coqui Studio API** is landed on 🐸TTS. - [Example](https://github.com/coqui-ai/TTS/blob/dev/README.md#-python-api)
- 📣 [**Coqui Studio API**](https://docs.coqui.ai/docs) is live.
- 📣 Voice generation with prompts - **Prompt to Voice** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin)!! - [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice)
- 📣 Voice generation with fusion - **Voice fusion** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
- 📣 Voice cloning is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
Expand Down Expand Up @@ -253,29 +251,6 @@ tts.tts_with_vc_to_file(
)
```

#### Example using [🐸Coqui Studio](https://coqui.ai) voices.
You access all of your cloned voices and built-in speakers in [🐸Coqui Studio](https://coqui.ai).
To do this, you'll need an API token, which you can obtain from the [account page](https://coqui.ai/account).
After obtaining the API token, you'll need to configure the COQUI_STUDIO_TOKEN environment variable.

Once you have a valid API token in place, the studio speakers will be displayed as distinct models within the list.
These models will follow the naming convention `coqui_studio/en/<studio_speaker_name>/coqui_studio`

```python
# XTTS model
models = TTS(cs_api_model="XTTS").list_models()
# Init TTS with the target studio speaker
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
# Run TTS
tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)

# V1 model
models = TTS(cs_api_model="V1").list_models()
# Run TTS with emotion and speed control
# Emotion control only works with V1 model
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
```

#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
For Fairseq models, use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html)
Expand Down Expand Up @@ -353,10 +328,6 @@ If you don't specify any models, then it uses LJSpeech based English model.
- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
```
$ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
```
- Run a TTS model with its default vocoder model:
```
Expand Down
158 changes: 30 additions & 128 deletions TTS/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import numpy as np
from torch import nn

from TTS.cs_api import CS_API
from TTS.utils.audio.numpy_transforms import save_wav
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
Expand All @@ -24,7 +23,6 @@ def __init__(
vocoder_path: str = None,
vocoder_config_path: str = None,
progress_bar: bool = True,
cs_api_model: str = "XTTS",
gpu=False,
):
"""🐸TTS python interface that allows to load and use the released models.
Expand Down Expand Up @@ -60,24 +58,19 @@ def __init__(
vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True.
cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
"XTTS", "V1". You can also use `TTS.cs_api.CS_API" for more control.
Defaults to "XTTS".
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
super().__init__()
self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
self.config = load_config(config_path) if config_path else None
self.synthesizer = None
self.voice_converter = None
self.csapi = None
self.cs_api_model = cs_api_model
self.model_name = ""
if gpu:
warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")

if model_name is not None and len(model_name) > 0:
if "tts_models" in model_name or "coqui_studio" in model_name:
if "tts_models" in model_name:
self.load_tts_model_by_name(model_name, gpu)
elif "voice_conversion_models" in model_name:
self.load_vc_model_by_name(model_name, gpu)
Expand All @@ -99,12 +92,6 @@ def is_multi_speaker(self):
return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
return False

@property
def is_coqui_studio(self):
if self.model_name is None:
return False
return "coqui_studio" in self.model_name

@property
def is_multi_lingual(self):
# Not sure what sets this to None, but applied a fix to prevent crashing.
Expand Down Expand Up @@ -136,14 +123,7 @@ def get_models_file_path():
return Path(__file__).parent / ".models.json"

def list_models(self):
try:
csapi = CS_API(model=self.cs_api_model)
models = csapi.list_speakers_as_tts_models()
except ValueError as e:
print(e)
models = []
manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
return manager.list_tts_models() + models
return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)

This comment has been minimized.

Copy link
@franua

franua Jun 11, 2024

It seems that this change has broken the "interface": the method now returns a ModelManager (a new one, not even self.manager) instead of the list that it returned before and that the documentation describes.
Now print(TTS().list_models()) outputs <TTS.utils.manage.ModelManager object at 0x363b7a910> and to actually get the list one must call either TTS().list_models().list_models() or TTS().manager.list_models().

This comment has been minimized.

Copy link
@eginhard

eginhard Jun 11, 2024

Contributor

Yes - this repo is not updated anymore, but this has already been fixed in our fork, available via pip install coqui-tts

This comment has been minimized.

Copy link
@franua

franua Jun 11, 2024

Oh, I see, it's a shame... Thank you very much, though, for your reply and for pointing me in the right direction, @eginhard! 🙏🏻


def download_model_by_name(self, model_name: str):
model_path, config_path, model_item = self.manager.download_model(model_name)
Expand Down Expand Up @@ -186,30 +166,26 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
TODO: Add tests
"""
self.synthesizer = None
self.csapi = None
self.model_name = model_name

if "coqui_studio" in model_name:
self.csapi = CS_API()
else:
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
model_name
)
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
model_name
)

# init synthesizer
# None values are fetch from the model
self.synthesizer = Synthesizer(
tts_checkpoint=model_path,
tts_config_path=config_path,
tts_speakers_file=None,
tts_languages_file=None,
vocoder_checkpoint=vocoder_path,
vocoder_config=vocoder_config_path,
encoder_checkpoint=None,
encoder_config=None,
model_dir=model_dir,
use_cuda=gpu,
)
# init synthesizer
# None values are fetch from the model
self.synthesizer = Synthesizer(
tts_checkpoint=model_path,
tts_config_path=config_path,
tts_speakers_file=None,
tts_languages_file=None,
vocoder_checkpoint=vocoder_path,
vocoder_config=vocoder_config_path,
encoder_checkpoint=None,
encoder_config=None,
model_dir=model_dir,
use_cuda=gpu,
)

def load_tts_model_by_path(
self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
Expand Down Expand Up @@ -246,77 +222,17 @@ def _check_arguments(
**kwargs,
) -> None:
"""Check if the arguments are valid for the model."""
if not self.is_coqui_studio:
# check for the coqui tts models
if self.is_multi_speaker and (speaker is None and speaker_wav is None):
raise ValueError("Model is multi-speaker but no `speaker` is provided.")
if self.is_multi_lingual and language is None:
raise ValueError("Model is multi-lingual but no `language` is provided.")
if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
raise ValueError("Model is not multi-speaker but `speaker` is provided.")
if not self.is_multi_lingual and language is not None:
raise ValueError("Model is not multi-lingual but `language` is provided.")
if not emotion is None and not speed is None:
raise ValueError("Emotion and speed can only be used with Coqui Studio models.")
else:
if emotion is None:
emotion = "Neutral"
if speed is None:
speed = 1.0
# check for the studio models
if speaker_wav is not None:
raise ValueError("Coqui Studio models do not support `speaker_wav` argument.")
if speaker is not None:
raise ValueError("Coqui Studio models do not support `speaker` argument.")
if language is not None and language != "en":
raise ValueError("Coqui Studio models currently support only `language=en` argument.")
if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]:
raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.")

def tts_coqui_studio(
self,
text: str,
speaker_name: str = None,
language: str = None,
emotion: str = None,
speed: float = 1.0,
pipe_out=None,
file_path: str = None,
) -> Union[np.ndarray, str]:
"""Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
Args:
text (str):
Input text to synthesize.
speaker_name (str, optional):
Speaker name from Coqui Studio. Defaults to None.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
supported by `XTTS` model.
emotion (str, optional):
Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
with "V1" model. Defaults to None.
speed (float, optional):
Speed of the speech. Defaults to 1.0.
pipe_out (BytesIO, optional):
Flag to stdout the generated TTS wav file for shell pipe.
file_path (str, optional):
Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
Returns:
Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file.
"""
speaker_name = self.model_name.split("/")[2]
if file_path is not None:
return self.csapi.tts_to_file(
text=text,
speaker_name=speaker_name,
language=language,
speed=speed,
pipe_out=pipe_out,
emotion=emotion,
file_path=file_path,
)[0]
return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0]
# check for the coqui tts models
if self.is_multi_speaker and (speaker is None and speaker_wav is None):
raise ValueError("Model is multi-speaker but no `speaker` is provided.")
if self.is_multi_lingual and language is None:
raise ValueError("Model is multi-lingual but no `language` is provided.")
if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
raise ValueError("Model is not multi-speaker but `speaker` is provided.")
if not self.is_multi_lingual and language is not None:
raise ValueError("Model is not multi-lingual but `language` is provided.")
if not emotion is None and not speed is None:
raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")

def tts(
self,
Expand Down Expand Up @@ -357,10 +273,6 @@ def tts(
self._check_arguments(
speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
)
if self.csapi is not None:
return self.tts_coqui_studio(
text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
)
wav = self.synthesizer.tts(
text=text,
speaker_name=speaker,
Expand Down Expand Up @@ -419,16 +331,6 @@ def tts_to_file(
"""
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)

if self.csapi is not None:
return self.tts_coqui_studio(
text=text,
speaker_name=speaker,
language=language,
emotion=emotion,
speed=speed,
file_path=file_path,
pipe_out=pipe_out,
)
wav = self.tts(
text=text,
speaker=speaker,
Expand Down
Loading

0 comments on commit 8c20a59

Please sign in to comment.