From 8c20a599d8d4eac32db2f7b8cd9f9b3d1190b73a Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 22:11:46 +0100 Subject: [PATCH] Remove coqui studio integration from TTS --- .github/workflows/api_tests.yml | 53 ----- Makefile | 3 - README.md | 29 --- TTS/api.py | 158 +++--------- TTS/bin/synthesize.py | 53 +---- TTS/cs_api.py | 317 ------------------------- TTS/utils/manage.py | 22 -- docs/source/inference.md | 42 ---- tests/api_tests/__init__.py | 0 tests/api_tests/test_python_api.py | 113 --------- tests/api_tests/test_synthesize_api.py | 25 -- 11 files changed, 33 insertions(+), 782 deletions(-) delete mode 100644 .github/workflows/api_tests.yml delete mode 100644 TTS/cs_api.py delete mode 100644 tests/api_tests/__init__.py delete mode 100644 tests/api_tests/test_python_api.py delete mode 100644 tests/api_tests/test_synthesize_api.py diff --git a/.github/workflows/api_tests.yml b/.github/workflows/api_tests.yml deleted file mode 100644 index 5a3baaad9e..0000000000 --- a/.github/workflows/api_tests.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: api_tests - -on: - push: - branches: - - main -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: | - export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - sudo apt-get install espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make api_tests - env: - COQUI_STUDIO_TOKEN: ${{ secrets.COQUI_STUDIO_TOKEN }} diff --git a/Makefile b/Makefile index 54aa6eeb18..7446848f46 100644 --- a/Makefile +++ b/Makefile @@ -35,9 +35,6 @@ test_zoo: ## run zoo tests. inference_tests: ## run inference tests. nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests -api_tests: ## run api tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.api_tests - data_tests: ## run data tests. nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests diff --git a/README.md b/README.md index ef16c9b6a1..17c362e099 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,6 @@ - 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html) - 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS. - 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html) -- 📣 **Coqui Studio API** is landed on 🐸TTS. 
- [Example](https://github.com/coqui-ai/TTS/blob/dev/README.md#-python-api)
-- 📣 [**Coqui Studio API**](https://docs.coqui.ai/docs) is live.
 - 📣 Voice generation with prompts - **Prompt to Voice** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin)!! - [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice)
 - 📣 Voice generation with fusion - **Voice fusion** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
 - 📣 Voice cloning is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
@@ -253,29 +251,6 @@ tts.tts_with_vc_to_file(
 )
 ```
 
-#### Example using [🐸Coqui Studio](https://coqui.ai) voices.
-You access all of your cloned voices and built-in speakers in [🐸Coqui Studio](https://coqui.ai).
-To do this, you'll need an API token, which you can obtain from the [account page](https://coqui.ai/account).
-After obtaining the API token, you'll need to configure the COQUI_STUDIO_TOKEN environment variable.
-
-Once you have a valid API token in place, the studio speakers will be displayed as distinct models within the list.
-These models will follow the naming convention `coqui_studio/en/<speaker_name>/coqui_studio`
-
-```python
-# XTTS model
-models = TTS(cs_api_model="XTTS").list_models()
-# Init TTS with the target studio speaker
-tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
-# Run TTS
-tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)
-
-# V1 model
-models = TTS(cs_api_model="V1").list_models()
-# Run TTS with emotion and speed control
-# Emotion control only works with V1 model
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
-```
-
 #### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
 For Fairseq models, use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`. You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html)
@@ -353,10 +328,6 @@ If you don't specify any models, then it uses LJSpeech based English model.
 
 - Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
 
-  ```
-  $ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
-  ```
-
 - Run a TTS model with its default vocoder model:
 
   ```
diff --git a/TTS/api.py b/TTS/api.py
index b3aa531b7f..7abc188e74 100644
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -6,7 +6,6 @@
 import numpy as np
 from torch import nn
 
-from TTS.cs_api import CS_API
 from TTS.utils.audio.numpy_transforms import save_wav
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
@@ -24,7 +23,6 @@ def __init__(
         vocoder_path: str = None,
         vocoder_config_path: str = None,
         progress_bar: bool = True,
-        cs_api_model: str = "XTTS",
         gpu=False,
     ):
         """🐸TTS python interface that allows to load and use the released models.
@@ -60,9 +58,6 @@ def __init__(
             vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
             vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
             progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
-            cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
-                "XTTS", "V1". You can also use `TTS.cs_api.CS_API" for more control.
-                Defaults to "XTTS".
             gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
""" super().__init__() @@ -70,14 +65,12 @@ def __init__( self.config = load_config(config_path) if config_path else None self.synthesizer = None self.voice_converter = None - self.csapi = None - self.cs_api_model = cs_api_model self.model_name = "" if gpu: warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.") if model_name is not None and len(model_name) > 0: - if "tts_models" in model_name or "coqui_studio" in model_name: + if "tts_models" in model_name: self.load_tts_model_by_name(model_name, gpu) elif "voice_conversion_models" in model_name: self.load_vc_model_by_name(model_name, gpu) @@ -99,12 +92,6 @@ def is_multi_speaker(self): return self.synthesizer.tts_model.speaker_manager.num_speakers > 1 return False - @property - def is_coqui_studio(self): - if self.model_name is None: - return False - return "coqui_studio" in self.model_name - @property def is_multi_lingual(self): # Not sure what sets this to None, but applied a fix to prevent crashing. @@ -136,14 +123,7 @@ def get_models_file_path(): return Path(__file__).parent / ".models.json" def list_models(self): - try: - csapi = CS_API(model=self.cs_api_model) - models = csapi.list_speakers_as_tts_models() - except ValueError as e: - print(e) - models = [] - manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False) - return manager.list_tts_models() + models + return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False) def download_model_by_name(self, model_name: str): model_path, config_path, model_item = self.manager.download_model(model_name) @@ -186,30 +166,26 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False): TODO: Add tests """ self.synthesizer = None - self.csapi = None self.model_name = model_name - if "coqui_studio" in model_name: - self.csapi = CS_API() - else: - model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( - model_name - ) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name + ) - # init synthesizer - # None values are fetch from the model - self.synthesizer = Synthesizer( - tts_checkpoint=model_path, - tts_config_path=config_path, - tts_speakers_file=None, - tts_languages_file=None, - vocoder_checkpoint=vocoder_path, - vocoder_config=vocoder_config_path, - encoder_checkpoint=None, - encoder_config=None, - model_dir=model_dir, - use_cuda=gpu, - ) + # init synthesizer + # None values are fetch from the model + self.synthesizer = Synthesizer( + tts_checkpoint=model_path, + tts_config_path=config_path, + tts_speakers_file=None, + tts_languages_file=None, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, + encoder_checkpoint=None, + encoder_config=None, + model_dir=model_dir, + use_cuda=gpu, + ) def load_tts_model_by_path( self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False @@ -246,77 +222,17 @@ def _check_arguments( **kwargs, ) -> None: """Check if the arguments are valid for the model.""" - if not self.is_coqui_studio: - # check for the coqui tts models - if self.is_multi_speaker and (speaker is None and speaker_wav is None): - raise ValueError("Model is multi-speaker but no `speaker` is provided.") - if self.is_multi_lingual and language is None: - raise ValueError("Model is multi-lingual but no `language` is provided.") - if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs: 
- raise ValueError("Model is not multi-speaker but `speaker` is provided.") - if not self.is_multi_lingual and language is not None: - raise ValueError("Model is not multi-lingual but `language` is provided.") - if not emotion is None and not speed is None: - raise ValueError("Emotion and speed can only be used with Coqui Studio models.") - else: - if emotion is None: - emotion = "Neutral" - if speed is None: - speed = 1.0 - # check for the studio models - if speaker_wav is not None: - raise ValueError("Coqui Studio models do not support `speaker_wav` argument.") - if speaker is not None: - raise ValueError("Coqui Studio models do not support `speaker` argument.") - if language is not None and language != "en": - raise ValueError("Coqui Studio models currently support only `language=en` argument.") - if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]: - raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.") - - def tts_coqui_studio( - self, - text: str, - speaker_name: str = None, - language: str = None, - emotion: str = None, - speed: float = 1.0, - pipe_out=None, - file_path: str = None, - ) -> Union[np.ndarray, str]: - """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API. - - Args: - text (str): - Input text to synthesize. - speaker_name (str, optional): - Speaker name from Coqui Studio. Defaults to None. - language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS` model. - emotion (str, optional): - Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available - with "V1" model. Defaults to None. - speed (float, optional): - Speed of the speech. Defaults to 1.0. - pipe_out (BytesIO, optional): - Flag to stdout the generated TTS wav file for shell pipe. - file_path (str, optional): - Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None. - - Returns: - Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file. - """ - speaker_name = self.model_name.split("/")[2] - if file_path is not None: - return self.csapi.tts_to_file( - text=text, - speaker_name=speaker_name, - language=language, - speed=speed, - pipe_out=pipe_out, - emotion=emotion, - file_path=file_path, - )[0] - return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0] + # check for the coqui tts models + if self.is_multi_speaker and (speaker is None and speaker_wav is None): + raise ValueError("Model is multi-speaker but no `speaker` is provided.") + if self.is_multi_lingual and language is None: + raise ValueError("Model is multi-lingual but no `language` is provided.") + if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs: + raise ValueError("Model is not multi-speaker but `speaker` is provided.") + if not self.is_multi_lingual and language is not None: + raise ValueError("Model is not multi-lingual but `language` is provided.") + if not emotion is None and not speed is None: + raise ValueError("Emotion and speed can only be used with Coqui Studio models. 
Which is discontinued.") def tts( self, @@ -357,10 +273,6 @@ def tts( self._check_arguments( speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs ) - if self.csapi is not None: - return self.tts_coqui_studio( - text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed - ) wav = self.synthesizer.tts( text=text, speaker_name=speaker, @@ -419,16 +331,6 @@ def tts_to_file( """ self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs) - if self.csapi is not None: - return self.tts_coqui_studio( - text=text, - speaker_name=speaker, - language=language, - emotion=emotion, - speed=speed, - file_path=file_path, - pipe_out=pipe_out, - ) wav = self.tts( text=text, speaker=speaker, diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index d9ec3063e6..b125baf7c3 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -66,12 +66,6 @@ $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay ``` -- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0: - - ``` - $ tts --text "Text for TTS" --model_name "coqui_studio///" --speed 1.2 --out_path output/path/speech.wav - ``` - - Run a TTS model with its default vocoder model: ``` @@ -222,25 +216,6 @@ def main(): default=None, ) parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None) - - # args for coqui studio - parser.add_argument( - "--cs_model", - type=str, - help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.", - ) - parser.add_argument( - "--emotion", - type=str, - help="Emotion to condition the model with. Only available for 🐸Coqui Studio `V1` model.", - default=None, - ) - parser.add_argument( - "--language", - type=str, - help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.", - default=None, - ) parser.add_argument( "--pipe_out", help="stdout the generated TTS wav file for shell pipe.", @@ -249,13 +224,7 @@ def main(): const=True, default=False, ) - parser.add_argument( - "--speed", - type=float, - help="Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0.", - default=None, - ) - + # args for multi-speaker synthesis parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None) @@ -389,7 +358,6 @@ def main(): # CASE1 #list : list pre-trained TTS models if args.list_models: - manager.add_cs_api_models(api.list_models()) manager.list_models() sys.exit() @@ -404,21 +372,6 @@ def main(): manager.model_info_by_full_name(model_query_full_name) sys.exit() - # CASE3: TTS with coqui studio models - if "coqui_studio" in args.model_name: - print(" > Using 🐸Coqui Studio model: ", args.model_name) - api = TTS(model_name=args.model_name, cs_api_model=args.cs_model) - api.tts_to_file( - text=args.text, - emotion=args.emotion, - file_path=args.out_path, - language=args.language, - speed=args.speed, - pipe_out=pipe_out, - ) - print(" > Saving output to ", args.out_path) - return - if args.language_idx is None and args.language is not None: msg = ( "--language is only supported for Coqui Studio models. 
" @@ -426,7 +379,7 @@ def main(): ) raise ValueError(msg) - # CASE4: load pre-trained model paths + # CASE3: load pre-trained model paths if args.model_name is not None and not args.model_path: model_path, config_path, model_item = manager.download_model(args.model_name) # tts model @@ -454,7 +407,7 @@ def main(): if args.vocoder_name is not None and not args.vocoder_path: vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - # CASE5: set custom model paths + # CASE4: set custom model paths if args.model_path is not None: tts_path = args.model_path tts_config_path = args.config_path diff --git a/TTS/cs_api.py b/TTS/cs_api.py deleted file mode 100644 index 9dc6c30dd4..0000000000 --- a/TTS/cs_api.py +++ /dev/null @@ -1,317 +0,0 @@ -import http.client -import json -import os -import tempfile -import urllib.request -from typing import Tuple - -import numpy as np -import requests -from scipy.io import wavfile - -from TTS.utils.audio.numpy_transforms import save_wav - - -class Speaker(object): - """Convert dict to object.""" - - def __init__(self, d, is_voice=False): - self.is_voice = is_voice - for k, v in d.items(): - if isinstance(k, (list, tuple)): - setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v]) - else: - setattr(self, k, Speaker(v) if isinstance(v, dict) else v) - - def __repr__(self): - return str(self.__dict__) - - -class CS_API: - """🐸Coqui Studio API Wrapper. - - 🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice - interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different - characteristics. You can use these voices to generate new audio files or use them in your applications. - You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token. - You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from - https://app.coqui.ai/account. We can either enter the token as an environment variable as - `export COQUI_STUDIO_TOKEN=` or pass it as `CS_API(api_token=)`. - Visit https://app.coqui.ai/api for more information. - - - Args: - api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable - `COQUI_STUDIO_TOKEN`. - model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`. Default is `XTTS`. 
- - - Example listing all available speakers: - >>> from TTS.api import CS_API - >>> tts = CS_API() - >>> tts.speakers - - Example listing all emotions: - >>> # emotions are only available for `V1` model - >>> from TTS.api import CS_API - >>> tts = CS_API(model="V1") - >>> tts.emotions - - Example with a built-in 🐸 speaker: - >>> from TTS.api import CS_API - >>> tts = CS_API() - >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name) - >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav") - - Example with multi-language model: - >>> from TTS.api import CS_API - >>> tts = CS_API(model="XTTS") - >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en") - """ - - MODEL_ENDPOINTS = { - "V1": { - "list_speakers": "https://app.coqui.ai/api/v2/speakers", - "synthesize": "https://app.coqui.ai/api/v2/samples", - "list_voices": "https://app.coqui.ai/api/v2/voices", - }, - "XTTS": { - "list_speakers": "https://app.coqui.ai/api/v2/speakers", - "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/", - "list_voices": "https://app.coqui.ai/api/v2/voices/xtts", - }, - } - - SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"] - - def __init__(self, api_token=None, model="XTTS"): - self.api_token = api_token - self.model = model - self.headers = None - self._speakers = None - self._check_token() - - @staticmethod - def ping_api(): - URL = "https://coqui.gateway.scarf.sh/tts/api" - _ = requests.get(URL) - - @property - def speakers(self): - if self._speakers is None: - self._speakers = self.list_all_speakers() - return self._speakers - - @property - def emotions(self): - """Return a list of available emotions. - - TODO: Get this from the API endpoint. 
- """ - if self.model == "V1": - return ["Neutral", "Happy", "Sad", "Angry", "Dull"] - else: - raise ValueError(f"❗ Emotions are not available for {self.model}.") - - def _check_token(self): - if self.api_token is None: - self.api_token = os.environ.get("COQUI_STUDIO_TOKEN") - self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"} - if not self.api_token: - raise ValueError( - "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n" - "Visit 🔗https://app.coqui.ai/account to get one.\n" - "Set it as an environment variable `export COQUI_STUDIO_TOKEN=`\n" - "" - ) - - def list_all_speakers(self): - """Return both built-in Coqui Studio speakers and custom voices created by the user.""" - return self.list_speakers() + self.list_voices() - - def list_speakers(self): - """List built-in Coqui Studio speakers.""" - self._check_token() - conn = http.client.HTTPSConnection("app.coqui.ai") - url = self.MODEL_ENDPOINTS[self.model]["list_speakers"] - conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers) - res = conn.getresponse() - data = res.read() - return [Speaker(s) for s in json.loads(data)["result"]] - - def list_voices(self): - """List custom voices created by the user.""" - conn = http.client.HTTPSConnection("app.coqui.ai") - url = self.MODEL_ENDPOINTS[self.model]["list_voices"] - conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers) - res = conn.getresponse() - data = res.read() - return [Speaker(s, True) for s in json.loads(data)["result"]] - - def list_speakers_as_tts_models(self): - """List speakers in ModelManager format.""" - models = [] - for speaker in self.speakers: - model = f"coqui_studio/multilingual/{speaker.name}/{self.model}" - models.append(model) - return models - - def name_to_speaker(self, name): - for speaker in self.speakers: - if speaker.name == name: - return speaker - raise ValueError(f"Speaker {name} not found in {self.speakers}") - - def id_to_speaker(self, speaker_id): - for speaker in self.speakers: - if speaker.id == speaker_id: - return speaker - raise ValueError(f"Speaker {speaker_id} not found.") - - @staticmethod - def url_to_np(url): - tmp_file, _ = urllib.request.urlretrieve(url) - rate, data = wavfile.read(tmp_file) - return data, rate - - @staticmethod - def _create_payload(model, text, speaker, speed, emotion, language): - payload = {} - # if speaker.is_voice: - payload["voice_id"] = speaker.id - # else: - payload["speaker_id"] = speaker.id - - if model == "V1": - payload.update( - { - "emotion": emotion, - "name": speaker.name, - "text": text, - "speed": speed, - } - ) - elif model == "XTTS": - payload.update( - { - "name": speaker.name, - "text": text, - "speed": speed, - "language": language, - } - ) - else: - raise ValueError(f"❗ Unknown model {model}") - return payload - - def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, language): - assert text is not None, "❗ text is required for V1 model." - assert speaker_name is not None, "❗ speaker_name is required for V1 model." - if self.model == "V1": - if emotion is None: - emotion = "Neutral" - assert language is None, "❗ language is not supported for V1 model." - elif self.model == "XTTS": - assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model." - assert language is not None, "❗ Language is required for XTTS model." - assert ( - language in self.SUPPORTED_LANGUAGES - ), f"❗ Language {language} is not yet supported. 
Check https://docs.coqui.ai/reference/samples_xtts_create." - return text, speaker_name, speaker_id, emotion, speed, language - - def tts( - self, - text: str, - speaker_name: str = None, - speaker_id=None, - emotion=None, - speed=1.0, - language=None, # pylint: disable=unused-argument - ) -> Tuple[np.ndarray, int]: - """Synthesize speech from text. - - Args: - text (str): Text to synthesize. - speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and - voices (user generated speakers) with `list_voices()`. - speaker_id (str): Speaker ID. If None, the speaker name is used. - emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only - supported by `V1` model. Defaults to None. - speed (float): Speed of the speech. 1.0 is normal speed. - language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. - """ - self._check_token() - self.ping_api() - - if speaker_name is None and speaker_id is None: - raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.") - if speaker_id is None: - speaker = self.name_to_speaker(speaker_name) - else: - speaker = self.id_to_speaker(speaker_id) - - text, speaker_name, speaker_id, emotion, speed, language = self._check_tts_args( - text, speaker_name, speaker_id, emotion, speed, language - ) - - conn = http.client.HTTPSConnection("app.coqui.ai") - payload = self._create_payload(self.model, text, speaker, speed, emotion, language) - url = self.MODEL_ENDPOINTS[self.model]["synthesize"] - conn.request("POST", url, json.dumps(payload), self.headers) - res = conn.getresponse() - data = res.read() - try: - wav, sr = self.url_to_np(json.loads(data)["audio_url"]) - except KeyError as e: - raise ValueError(f" [!] 🐸 API returned error: {data}") from e - return wav, sr - - def tts_to_file( - self, - text: str, - speaker_name: str, - speaker_id=None, - emotion=None, - speed=1.0, - pipe_out=None, - language=None, - file_path: str = None, - ) -> str: - """Synthesize speech from text and save it to a file. - - Args: - text (str): Text to synthesize. - speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and - voices (user generated speakers) with `list_voices()`. - speaker_id (str): Speaker ID. If None, the speaker name is used. - emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". - speed (float): Speed of the speech. 1.0 is normal speed. - pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. - language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". - file_path (str): Path to save the file. If None, a temporary file is created. 
- """ - if file_path is None: - file_path = tempfile.mktemp(".wav") - wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language) - save_wav(wav=wav, path=file_path, sample_rate=sr, pipe_out=pipe_out) - return file_path - - -if __name__ == "__main__": - import time - - api = CS_API() - print(api.speakers) - print(api.list_speakers_as_tts_models()) - - ts = time.time() - wav, sr = api.tts( - "It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name - ) - print(f" [i] XTTS took {time.time() - ts:.2f}s") - - filepath = api.tts_to_file( - text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav" - ) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index ed7cb2cc07..59dcb58ff0 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -68,28 +68,6 @@ def read_models_file(self, file_path): with open(file_path, "r", encoding="utf-8") as json_file: self.models_dict = json.load(json_file) - def add_cs_api_models(self, model_list: List[str]): - """Add list of Coqui Studio model names that are returned from the api - - Each has the following format `/en//` - """ - - def _add_model(model_name: str): - if not "coqui_studio" in model_name: - return - model_type, lang, dataset, model = model_name.split("/") - if model_type not in self.models_dict: - self.models_dict[model_type] = {} - if lang not in self.models_dict[model_type]: - self.models_dict[model_type][lang] = {} - if dataset not in self.models_dict[model_type][lang]: - self.models_dict[model_type][lang][dataset] = {} - if model not in self.models_dict[model_type][lang][dataset]: - self.models_dict[model_type][lang][dataset][model] = {} - - for model_name in model_list: - _add_model(model_name) - def _list_models(self, model_type, model_count=0): if self.verbose: print("\n Name format: type/language/dataset/model") diff --git a/docs/source/inference.md b/docs/source/inference.md index 611a2445bf..56bccfb5b2 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -172,48 +172,6 @@ tts.tts_with_vc_to_file( ) ``` -#### Example text to speech using [🐸Coqui Studio](https://coqui.ai) models. - -You can use all of your available speakers in the studio. -[🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account). -You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token. - -```python -# If you have a valid API token set you will see the studio speakers as separate models in the list. -# The name format is coqui_studio/en//coqui_studio -models = TTS().list_models() -# Init TTS with the target studio speaker -tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False) -# Run TTS -tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH) -# Run TTS with emotion and speed control -tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5) -``` - -If you just need 🐸 Coqui Studio speakers, you can use `CS_API`. It is a wrapper around the 🐸 Coqui Studio API. - -```python -from TTS.api import CS_API - -# Init 🐸 Coqui Studio API -# you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument. - -# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. -api = CS_API(api_token=, model="XTTS") -api.speakers # all the speakers are available with all the models. 
-api.list_speakers()
-api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", language="en", speed=1.5)
-
-# V1 - Fast and lightweight TTS in EN with emotion control.
-api = CS_API(api_token=<token>, model="V1")
-api.speakers
-api.emotions # emotions are only for the V1 model.
-api.list_speakers()
-api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
-```
-
 #### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
 For these models use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
diff --git a/tests/api_tests/__init__.py b/tests/api_tests/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/tests/api_tests/test_python_api.py b/tests/api_tests/test_python_api.py
deleted file mode 100644
index 2025fcd9c6..0000000000
--- a/tests/api_tests/test_python_api.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import os
-import unittest
-
-from tests import get_tests_data_path, get_tests_output_path
-from TTS.api import CS_API, TTS
-
-OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav")
-cloning_test_wav_path = os.path.join(get_tests_data_path(), "ljspeech/wavs/LJ001-0028.wav")
-
-
-is_coqui_available = os.environ.get("COQUI_STUDIO_TOKEN")
-
-
-if is_coqui_available:
-
-    class CS_APITest(unittest.TestCase):
-        def test_speakers(self):
-            tts = CS_API()
-            self.assertGreater(len(tts.speakers), 1)
-
-        def test_emotions(self):
-            tts = CS_API()
-            self.assertGreater(len(tts.emotions), 1)
-
-        def test_list_calls(self):
-            tts = CS_API()
-            self.assertGreater(len(tts.list_voices()), 1)
-            self.assertGreater(len(tts.list_speakers()), 1)
-            self.assertGreater(len(tts.list_all_speakers()), 1)
-            self.assertGreater(len(tts.list_speakers_as_tts_models()), 1)
-
-        def test_name_to_speaker(self):
-            tts = CS_API()
-            speaker_name = tts.list_speakers_as_tts_models()[0].split("/")[2]
-            speaker = tts.name_to_speaker(speaker_name)
-            self.assertEqual(speaker.name, speaker_name)
-
-        def test_tts(self):
-            tts = CS_API()
-            wav, sr = tts.tts(text="This is a test.", speaker_name=tts.list_speakers()[0].name)
-            self.assertEqual(sr, 44100)
-            self.assertGreater(len(wav), 1)
-
-    class TTSTest(unittest.TestCase):
-        def test_single_speaker_model(self):
-            tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
-
-            error_raised = False
-            try:
-                tts.tts_to_file(text="Ich bin eine Testnachricht.", speaker="Thorsten", language="de")
-            except ValueError:
-                error_raised = True
-
-            tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)
-
-            self.assertTrue(error_raised)
-            self.assertFalse(tts.is_multi_speaker)
-            self.assertFalse(tts.is_multi_lingual)
-            self.assertIsNone(tts.speakers)
-            self.assertIsNone(tts.languages)
-
-        def test_studio_model(self):
-            tts = TTS(model_name="coqui_studio/en/Zacharie Aimilios/coqui_studio")
-            tts.tts_to_file(text="This is a test.")
-
-            # check speed > 2.0 raises error
-            raised_error = False
-            try:
-                _ = tts.tts(text="This is a test.", speed=4.0, emotion="Sad") # should raise error with speed > 2.0
-            except ValueError:
-                raised_error = True
-            self.assertTrue(raised_error)
-
-            # check emotion is invalid
-            raised_error = False
-            try:
-                _ = tts.tts(text="This is a test.", speed=2.0, emotion="No Emo") # should raise error with speed > 2.0
-            except ValueError:
-                raised_error = True
-            self.assertTrue(raised_error)
-
-            # check valid call
-            wav = tts.tts(text="This is a test.", 
speed=2.0, emotion="Sad") - self.assertGreater(len(wav), 0) - - def test_fairseq_model(self): # pylint: disable=no-self-use - tts = TTS(model_name="tts_models/eng/fairseq/vits") - tts.tts_to_file(text="This is a test.") - - def test_multi_speaker_multi_lingual_model(self): - tts = TTS() - tts.load_tts_model_by_name(tts.models[0]) # YourTTS - tts.tts_to_file( - text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path=OUTPUT_PATH - ) - - self.assertTrue(tts.is_multi_speaker) - self.assertTrue(tts.is_multi_lingual) - self.assertGreater(len(tts.speakers), 1) - self.assertGreater(len(tts.languages), 1) - - def test_voice_cloning(self): # pylint: disable=no-self-use - tts = TTS() - tts.load_tts_model_by_name("tts_models/multilingual/multi-dataset/your_tts") - tts.tts_to_file("Hello world!", speaker_wav=cloning_test_wav_path, language="en", file_path=OUTPUT_PATH) - - def test_voice_conversion(self): # pylint: disable=no-self-use - tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=False) - tts.voice_conversion_to_file( - source_wav=cloning_test_wav_path, - target_wav=cloning_test_wav_path, - file_path=OUTPUT_PATH, - ) diff --git a/tests/api_tests/test_synthesize_api.py b/tests/api_tests/test_synthesize_api.py deleted file mode 100644 index e7b4f12048..0000000000 --- a/tests/api_tests/test_synthesize_api.py +++ /dev/null @@ -1,25 +0,0 @@ -import os - -from tests import get_tests_output_path, run_cli - - -def test_synthesize(): - """Test synthesize.py with diffent arguments.""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - - # 🐸 Coqui studio model - run_cli( - 'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" ' - '--text "This is it" ' - f'--out_path "{output_path}"' - ) - - # 🐸 Coqui studio model with speed arg. - run_cli( - 'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" ' - '--text "This is it but slow" --speed 0.1' - f'--out_path "{output_path}"' - ) - - # test pipe_out command - run_cli(f'tts --text "test." --pipe_out --out_path "{output_path}" | aplay')