From 3a104d5c49e6f9063e32ba5fed8e34f1dce25887 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Sun, 13 Aug 2023 12:04:12 +0200
Subject: [PATCH] Update Studio API for XTTS (#2861)

* Update Studio API for XTTS

* Update the docs

* Update README.md

* Update README.md

Update README
---
 README.md                  |  54 ++++--
 TTS/api.py                 | 249 ++-------------------------
 TTS/bin/synthesize.py      |  19 ++-
 TTS/cs_api.py              | 338 +++++++++++++++++++++++++++++++++++++
 TTS/tts/models/tortoise.py |   8 +-
 TTS/utils/manage.py        |   2 +-
 docs/source/inference.md   |  20 ++-
 7 files changed, 432 insertions(+), 258 deletions(-)
 create mode 100644 TTS/cs_api.py

diff --git a/README.md b/README.md
index bf1bf2c062..834208406c 100644
--- a/README.md
+++ b/README.md
@@ -108,7 +108,7 @@ Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not relea
 - Capacitron: [paper](https://arxiv.org/abs/1906.03402)
 - OverFlow: [paper](https://arxiv.org/abs/2211.06892)
 - Neural HMM TTS: [paper](https://arxiv.org/abs/2108.13320)
-- Delightful TTS: [paper](https://arxiv.org/abs/2110.12612)
+- Delightful TTS: [paper](https://arxiv.org/abs/2110.12612)

 ### End-to-End Models
 - VITS: [paper](https://arxiv.org/pdf/2106.06103)
@@ -204,9 +204,11 @@ tts = TTS(model_name)
 wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
 # Text to speech to a file
 tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
+```

-# Running a single speaker model
+#### Running a single speaker model

+```python
 # Init TTS with the target model name
 tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
 # Run TTS
@@ -218,15 +220,21 @@ tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_
 tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
 tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
 tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav")
+```

+#### Example voice conversion

-# Example voice conversion converting speaker of the `source_wav` to the speaker of the `target_wav`
+Converting the voice in `source_wav` to the voice of `target_wav`:

+```python
 tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=True)
 tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
+```
+
+#### Example voice cloning together with the voice conversion model
+
+This way, you can clone voices by using any model in 🐸TTS.

-# Example voice cloning by a single speaker TTS model combining with the voice conversion model. This way, you can
-# clone voices by using any model in 🐸TTS.

+```python
 tts = TTS("tts_models/de/thorsten/tacotron2-DDC")
 tts.tts_with_vc_to_file(
     "Wie sage ich auf Italienisch, dass ich dich liebe?",
@@ -234,29 +242,43 @@ tts.tts_with_vc_to_file(
     speaker_wav="target/speaker.wav",
     file_path="output.wav"
 )
+```

-# Example text to speech using [🐸Coqui Studio](https://coqui.ai) models.
+#### Example using [🐸Coqui Studio](https://coqui.ai) voices
+
+You can access all of your cloned voices and built-in speakers in [🐸Coqui Studio](https://coqui.ai).
+To do this, you'll need an API token, which you can obtain from the [account page](https://coqui.ai/account).
+After obtaining the API token, set the `COQUI_STUDIO_TOKEN` environment variable (e.g. `export COQUI_STUDIO_TOKEN=<token>`).

-# You can use all of your available speakers in the studio.
-# [🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
-# You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
+Once a valid API token is set, the studio speakers appear as distinct models in the model list.
+These models follow the naming convention `coqui_studio/<language>/<speaker_name>/<model_name>`.

-# If you have a valid API token set you will see the studio speakers as separate models in the list.
-# The name format is coqui_studio/en/<speaker_name>/coqui_studio
-models = TTS().list_models()
+```python
+# XTTS model
+models = TTS(cs_api_model="XTTS").list_models()
 # Init TTS with the target studio speaker
 tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False, gpu=False)
 # Run TTS
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+
+# V1 model
+models = TTS(cs_api_model="V1").list_models()
 # Run TTS with emotion and speed control
+# Emotion control only works with the V1 model
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
+
+# XTTS-multilang model
+models = TTS(cs_api_model="XTTS-multilang").list_models()
+# Run TTS with language and speed control
+# Emotion control is not used here; it only works with the V1 model
+tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0)
+```

-#Example text to speech using **Fairseq models in ~1100 languages** 🤯.
-
-#For these models use the following name format: `tts_models/<lang-iso-code>/fairseq/vits`.
-#You can find the list of language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
+#### Example text to speech using **Fairseq models in ~1100 languages** 🤯
+For Fairseq models, use the following name format: `tts_models/<lang-iso-code>/fairseq/vits`.
+You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html)
+and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).

+```python
 # TTS with on the fly voice conversion
 api = TTS("tts_models/deu/fairseq/vits")
 api.tts_with_vc_to_file(
diff --git a/TTS/api.py b/TTS/api.py
index 861c541826..5bb91362a3 100644
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -1,234 +1,15 @@
-import http.client
-import json
-import os
 import tempfile
-import urllib.request
 from pathlib import Path
-from typing import Tuple, Union
+from typing import Union

 import numpy as np
-import requests
-from scipy.io import wavfile

+from TTS.cs_api import CS_API
 from TTS.utils.audio.numpy_transforms import save_wav
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer


-class Speaker(object):
-    """Convert dict to object."""
-
-    def __init__(self, d, is_voice=False):
-        self.is_voice = is_voice
-        for k, v in d.items():
-            if isinstance(k, (list, tuple)):
-                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
-            else:
-                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)
-
-    def __repr__(self):
-        return str(self.__dict__)
-
-
-class CS_API:
-    """🐸Coqui Studio API Wrapper.
-
-    🐸Coqui Studio is the most advanced voice generation platform.
-    You can generate new voices by voice cloning, voice interpolation, or our unique prompt to voice technology.
-    It also provides a set of built-in voices with different characteristics. You can use these voices to generate
-    new audio files or use them in your applications.
-    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
-    You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
-    https://app.coqui.ai/account. We can either enter the token as an environment variable as
-    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<token>)`.
-    Visit https://app.coqui.ai/api for more information.
-
-    Example listing all available speakers:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> tts.speakers
-
-    Example listing all emotions:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> tts.emotions
-
-    Example with a built-in 🐸 speaker:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> wav, sr = api.tts("Hello world", speaker_name="Claribel Dervla")
-        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
-    """
-
-    def __init__(self, api_token=None):
-        self.api_token = api_token
-        self.api_prefix = "/api/v2"
-        self.headers = None
-        self._speakers = None
-        self._check_token()
-
-    @staticmethod
-    def ping_api():
-        URL = "https://coqui.gateway.scarf.sh/tts/api"
-        _ = requests.get(URL)
-
-    @property
-    def speakers(self):
-        if self._speakers is None:
-            self._speakers = self.list_all_speakers()
-        return self._speakers
-
-    @property
-    def emotions(self):
-        """Return a list of available emotions.
-
-        TODO: Get this from the API endpoint.
-        """
-        return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
-
-    def _check_token(self):
-        if self.api_token is None:
-            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
-        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
-        if not self.api_token:
-            raise ValueError(
-                "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n"
-                "Visit 🔗https://app.coqui.ai/account to get one.\n"
-                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
-            )
-
-    def list_all_speakers(self):
-        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
-        return self.list_speakers() + self.list_voices()
-
-    def list_speakers(self):
-        """List built-in Coqui Studio speakers."""
-        self._check_token()
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        conn.request("GET", f"{self.api_prefix}/speakers?per_page=100", headers=self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        return [Speaker(s) for s in json.loads(data)["result"]]
-
-    def list_voices(self):
-        """List custom voices created by the user."""
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        conn.request("GET", f"{self.api_prefix}/voices", headers=self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        return [Speaker(s, True) for s in json.loads(data)["result"]]
-
-    def list_speakers_as_tts_models(self):
-        """List speakers in ModelManager format."""
-        models = []
-        for speaker in self.speakers:
-            model = f"coqui_studio/en/{speaker.name}/coqui_studio"
-            models.append(model)
-        return models
-
-    def name_to_speaker(self, name):
-        for speaker in self.speakers:
-            if speaker.name == name:
-                return speaker
-        raise ValueError(f"Speaker {name} not found in {self.speakers}")
-
-    def id_to_speaker(self, speaker_id):
-        for speaker in self.speakers:
-            if speaker.id == speaker_id:
-                return speaker
-        raise ValueError(f"Speaker {speaker_id} not found.")
-
-    @staticmethod
-    def url_to_np(url):
-        tmp_file, _ = urllib.request.urlretrieve(url)
-        rate, data = wavfile.read(tmp_file)
-        return data, rate
-
-    @staticmethod
-    def _create_payload(text, speaker, emotion, speed):
-        payload = {}
-        if speaker.is_voice:
-            payload["voice_id"] = speaker.id
-        else:
-            payload["speaker_id"] = speaker.id
-        payload.update(
-            {
-                "emotion": emotion,
-                "name": speaker.name,
-                "text": text,
-                "speed": speed,
-            }
-        )
-        return payload
-
-    def tts(
-        self,
-        text: str,
-        speaker_name: str = None,
-        speaker_id=None,
-        emotion="Neutral",
-        speed=1.0,
-        language=None,  # pylint: disable=unused-argument
-    ) -> Tuple[np.ndarray, int]:
-        """Synthesize speech from text.
-
-        Args:
-            text (str): Text to synthesize.
-            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
-                voices (user generated speakers) with `list_voices()`.
-            speaker_id (str): Speaker ID. If None, the speaker name is used.
-            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
-            speed (float): Speed of the speech. 1.0 is normal speed.
-            language (str): Language of the text. If None, the default language of the speaker is used.
-        """
-        self._check_token()
-        self.ping_api()
-        if speaker_name is None and speaker_id is None:
-            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
-        if speaker_id is None:
-            speaker = self.name_to_speaker(speaker_name)
-        else:
-            speaker = self.id_to_speaker(speaker_id)
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        payload = self._create_payload(text, speaker, emotion, speed)
-        conn.request("POST", "/api/v2/samples", json.dumps(payload), self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        try:
-            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
-        except KeyError as e:
-            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
-        return wav, sr
-
-    def tts_to_file(
-        self,
-        text: str,
-        speaker_name: str,
-        speaker_id=None,
-        emotion="Neutral",
-        speed=1.0,
-        language=None,
-        file_path: str = None,
-    ) -> str:
-        """Synthesize speech from text and save it to a file.
-
-        Args:
-            text (str): Text to synthesize.
-            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
-                voices (user generated speakers) with `list_voices()`.
-            speaker_id (str): Speaker ID. If None, the speaker name is used.
-            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
-            speed (float): Speed of the speech. 1.0 is normal speed.
-            language (str): Language of the text. If None, the default language of the speaker is used.
-            file_path (str): Path to save the file. If None, a temporary file is created.
-        """
-        if file_path is None:
-            file_path = tempfile.mktemp(".wav")
-        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
-        wavfile.write(file_path, sr, wav)
-        return file_path
-
-
 class TTS:
     """TODO: Add voice conversion and Capacitron support."""

@@ -240,6 +21,7 @@ def __init__(
         vocoder_path: str = None,
         vocoder_config_path: str = None,
         progress_bar: bool = True,
+        cs_api_model: str = "XTTS",
         gpu=False,
     ):
         """🐸TTS python interface that allows to load and use the released models.
@@ -275,6 +57,9 @@ def __init__(
             vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
             vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
             progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
+            cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
+                "XTTS", "XTTS-multilang" and "V1". You can also use `TTS.cs_api.CS_API` directly for more control.
+                Defaults to "XTTS".
             gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
         """
         self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
@@ -282,6 +67,7 @@ def __init__(
         self.synthesizer = None
         self.voice_converter = None
         self.csapi = None
+        self.cs_api_model = cs_api_model
         self.model_name = None

         if model_name is not None:
@@ -333,10 +119,9 @@ def languages(self):
     def get_models_file_path():
         return Path(__file__).parent / ".models.json"

-    @staticmethod
-    def list_models():
+    def list_models(self):
         try:
-            csapi = CS_API()
+            csapi = CS_API(model=self.cs_api_model)
             models = csapi.list_speakers_as_tts_models()
         except ValueError as e:
             print(e)
@@ -468,7 +253,7 @@ def tts_coqui_studio(
         text: str,
         speaker_name: str = None,
         language: str = None,
-        emotion: str = "Neutral",
+        emotion: str = None,
         speed: float = 1.0,
         file_path: str = None,
     ) -> Union[np.ndarray, str]:
@@ -479,10 +264,11 @@ def tts_coqui_studio(
                 Input text to synthesize.
             speaker_name (str, optional):
                 Speaker name from Coqui Studio. Defaults to None.
-            language (str, optional):
-                Language code. Coqui Studio currently supports only English. Defaults to None.
+            language (str, optional): Language of the text. Language is only supported by the `XTTS-multilang`
+                model. Currently supports en, de, es, fr, it, pt, pl. Defaults to None.
             emotion (str, optional):
-                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Defaults to "Neutral".
+                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only
+                available with the "V1" model. Defaults to None.
             speed (float, optional):
                 Speed of the speech. Defaults to 1.0.
             file_path (str, optional):
@@ -521,9 +307,8 @@ def tts(
             speaker (str, optional):
                 Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                 `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
-            language (str, optional):
-                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
-                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
+            language (str, optional): Language of the text. For local multi-lingual models, check
+                `tts.is_multi_lingual` and list available languages by `tts.languages`. For Coqui Studio, language
+                is only supported by the `XTTS-multilang` model (en, de, es, fr, it, pt, pl). Defaults to None.
             speaker_wav (str, optional):
                 Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                 Defaults to None.
@@ -559,7 +344,7 @@ def tts_to_file(
         speaker: str = None,
         language: str = None,
         speaker_wav: str = None,
-        emotion: str = "Neutral",
+        emotion: str = None,
         speed: float = 1.0,
         file_path: str = "output.wav",
         **kwargs,
diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
index dbe7f99856..d4350cd5e8 100755
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@@ -185,11 +185,22 @@ def main():
     parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

     # args for coqui studio
+    parser.add_argument(
+        "--cs_model",
+        type=str,
+        default="XTTS",
+        help="Name of the 🐸Coqui Studio model. "
+        "Available models are `XTTS`, `XTTS-multilang` and `V1`.",
+    )
     parser.add_argument(
         "--emotion",
         type=str,
-        help="Emotion to condition the model with. Only available for 🐸Coqui Studio models.",
-        default="Neutral",
+        help="Emotion to condition the model with. Only available for the 🐸Coqui Studio `V1` model.",
+        default=None,
+    )
+    parser.add_argument(
+        "--language",
+        type=str,
+        help="Language to condition the model with. Only available for the 🐸Coqui Studio `XTTS-multilang` model.",
+        default=None,
     )

     # args for multi-speaker synthesis
@@ -335,8 +346,8 @@ def main():
     # CASE3: TTS with coqui studio models
     if "coqui_studio" in args.model_name:
         print(" > Using 🐸Coqui Studio model: ", args.model_name)
-        api = TTS(model_name=args.model_name)
-        api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path)
+        api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
+        api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path, language=args.language)
         print(" > Saving output to ", args.out_path)
         return

diff --git a/TTS/cs_api.py b/TTS/cs_api.py
new file mode 100644
index 0000000000..a36452abc9
--- /dev/null
+++ b/TTS/cs_api.py
@@ -0,0 +1,338 @@
+import http.client
+import json
+import os
+import tempfile
+import urllib.request
+from typing import Tuple
+
+import numpy as np
+import requests
+from scipy.io import wavfile
+
+
+class Speaker(object):
+    """Convert dict to object."""
+
+    def __init__(self, d, is_voice=False):
+        self.is_voice = is_voice
+        for k, v in d.items():
+            if isinstance(k, (list, tuple)):
+                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
+            else:
+                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)
+
+    def __repr__(self):
+        return str(self.__dict__)
+
+
+class CS_API:
+    """🐸Coqui Studio API Wrapper.
+
+    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
+    interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
+    characteristics. You can use these voices to generate new audio files or use them in your applications.
+    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
+    You can sign up for 🐸Coqui Studio at https://app.coqui.ai/auth/signup and get an API token from
+    https://app.coqui.ai/account. You can either set the token as an environment variable with
+    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<token>)`.
+    Visit https://app.coqui.ai/api for more information.
+
+
+    Args:
+        api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
+            `COQUI_STUDIO_TOKEN`.
+        model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`.
+
+
+    Example listing all available speakers:
+        >>> from TTS.api import CS_API
+        >>> tts = CS_API()
+        >>> tts.speakers
+
+    Example listing all emotions:
+        >>> # emotions are only available for the `V1` model
+        >>> from TTS.api import CS_API
+        >>> tts = CS_API(model="V1")
+        >>> tts.emotions
+
+    Example with a built-in 🐸 speaker:
+        >>> from TTS.api import CS_API
+        >>> tts = CS_API()
+        >>> wav, sr = tts.tts("Hello world", speaker_name=tts.speakers[0].name)
+        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
+
+    Example with multi-language model:
+        >>> from TTS.api import CS_API
+        >>> tts = CS_API(model="XTTS-multilang")
+        >>> wav, sr = tts.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
+    """
+
+    MODEL_ENDPOINTS = {
+        "V1": {
+            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
+            "synthesize": "https://app.coqui.ai/api/v2/samples",
+            "list_voices": "https://app.coqui.ai/api/v2/voices",
+        },
+        "XTTS": {
+            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
+            "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
+            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
+        },
+        "XTTS-multilang": {
+            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
+            "synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/",
+            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
+        },
+    }
+
+    SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"]
+
+    def __init__(self, api_token=None, model="XTTS"):
+        self.api_token = api_token
+        self.model = model
+        self.headers = None
+        self._speakers = None
+        self._check_token()
+
+    @staticmethod
+    def ping_api():
+        URL = "https://coqui.gateway.scarf.sh/tts/api"
+        _ = requests.get(URL)
+
+    @property
+    def speakers(self):
+        if self._speakers is None:
+            self._speakers = self.list_all_speakers()
+        return self._speakers
+
+    @property
+    def emotions(self):
+        """Return a list of available emotions.
+
+        TODO: Get this from the API endpoint.
+ """ + if self.model == "V1": + return ["Neutral", "Happy", "Sad", "Angry", "Dull"] + else: + raise ValueError(f"❗ Emotions are not available for {self.model}.") + + def _check_token(self): + if self.api_token is None: + self.api_token = os.environ.get("COQUI_STUDIO_TOKEN") + self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"} + if not self.api_token: + raise ValueError( + "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n" + "Visit 🔗https://app.coqui.ai/account to get one.\n" + "Set it as an environment variable `export COQUI_STUDIO_TOKEN=`\n" + "" + ) + + def list_all_speakers(self): + """Return both built-in Coqui Studio speakers and custom voices created by the user.""" + return self.list_speakers() + self.list_voices() + + def list_speakers(self): + """List built-in Coqui Studio speakers.""" + self._check_token() + conn = http.client.HTTPSConnection("app.coqui.ai") + url = self.MODEL_ENDPOINTS[self.model]["list_speakers"] + conn.request("GET", f"{url}?per_page=100", headers=self.headers) + res = conn.getresponse() + data = res.read() + return [Speaker(s) for s in json.loads(data)["result"]] + + def list_voices(self): + """List custom voices created by the user.""" + conn = http.client.HTTPSConnection("app.coqui.ai") + url = self.MODEL_ENDPOINTS[self.model]["list_voices"] + conn.request("GET", f"{url}", headers=self.headers) + res = conn.getresponse() + data = res.read() + return [Speaker(s, True) for s in json.loads(data)["result"]] + + def list_speakers_as_tts_models(self): + """List speakers in ModelManager format.""" + models = [] + for speaker in self.speakers: + model = f"coqui_studio/multilingual/{speaker.name}/{self.model}" + models.append(model) + return models + + def name_to_speaker(self, name): + for speaker in self.speakers: + if speaker.name == name: + return speaker + raise ValueError(f"Speaker {name} not found in {self.speakers}") + + def id_to_speaker(self, speaker_id): + for speaker in self.speakers: + if speaker.id == speaker_id: + return speaker + raise ValueError(f"Speaker {speaker_id} not found.") + + @staticmethod + def url_to_np(url): + tmp_file, _ = urllib.request.urlretrieve(url) + rate, data = wavfile.read(tmp_file) + return data, rate + + @staticmethod + def _create_payload(model, text, speaker, speed, emotion, language): + payload = {} + # if speaker.is_voice: + payload["voice_id"] = speaker.id + # else: + payload["speaker_id"] = speaker.id + + if model == "V1": + payload.update( + { + "emotion": emotion, + "name": speaker.name, + "text": text, + "speed": speed, + } + ) + elif model == "XTTS": + payload.update( + { + "name": speaker.name, + "text": text, + "speed": speed, + } + ) + elif model == "XTTS-multilang": + payload.update( + { + "name": speaker.name, + "text": text, + "speed": speed, + "language": language, + } + ) + else: + raise ValueError(f"❗ Unknown model {model}") + return payload + + def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, language): + assert text is not None, "❗ text is required for V1 model." + assert speaker_name is not None, "❗ speaker_name is required for V1 model." + if self.model == "V1": + if emotion is None: + emotion = "Neutral" + assert language is None, "❗ language is not supported for V1 model." + elif self.model == "XTTS": + assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model." + assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model." 
+        elif self.model == "XTTS-multilang":
+            assert emotion is None, "❗ Emotions are not supported for XTTS-multilang model. Use V1 model."
+            assert language is not None, "❗ Language is required for XTTS-multilang model."
+            assert (
+                language in self.SUPPORTED_LANGUAGES
+            ), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl"
+        return text, speaker_name, speaker_id, emotion, speed, language
+
+    def tts(
+        self,
+        text: str,
+        speaker_name: str = None,
+        speaker_id=None,
+        emotion=None,
+        speed=1.0,
+        language=None,
+    ) -> Tuple[np.ndarray, int]:
+        """Synthesize speech from text.
+
+        Args:
+            text (str): Text to synthesize.
+            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
+                voices (user generated speakers) with `list_voices()`.
+            speaker_id (str): Speaker ID. If None, the speaker name is used.
+            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are
+                only supported by the `V1` model. Defaults to None.
+            speed (float): Speed of the speech. 1.0 is normal speed.
+            language (str): Language of the text. Language is only supported by the `XTTS-multilang` model, which
+                currently supports en, de, es, fr, it, pt, pl. Defaults to None.
+        """
+        self._check_token()
+        self.ping_api()
+
+        if speaker_name is None and speaker_id is None:
+            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
+        if speaker_id is None:
+            speaker = self.name_to_speaker(speaker_name)
+        else:
+            speaker = self.id_to_speaker(speaker_id)
+
+        text, speaker_name, speaker_id, emotion, speed, language = self._check_tts_args(
+            text, speaker_name, speaker_id, emotion, speed, language
+        )
+
+        conn = http.client.HTTPSConnection("app.coqui.ai")
+        payload = self._create_payload(self.model, text, speaker, speed, emotion, language)
+        url = self.MODEL_ENDPOINTS[self.model]["synthesize"]
+        conn.request("POST", url, json.dumps(payload), self.headers)
+        res = conn.getresponse()
+        data = res.read()
+        try:
+            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
+        except KeyError as e:
+            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
+        return wav, sr
+
+    def tts_to_file(
+        self,
+        text: str,
+        speaker_name: str,
+        speaker_id=None,
+        emotion=None,
+        speed=1.0,
+        language=None,
+        file_path: str = None,
+    ) -> str:
+        """Synthesize speech from text and save it to a file.
+
+        Args:
+            text (str): Text to synthesize.
+            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
+                voices (user generated speakers) with `list_voices()`.
+            speaker_id (str): Speaker ID. If None, the speaker name is used.
+            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are
+                only supported by the `V1` model. Defaults to None.
+            speed (float): Speed of the speech. 1.0 is normal speed.
+            language (str): Language of the text. Language is only supported by the `XTTS-multilang` model, which
+                currently supports en, de, es, fr, it, pt, pl. Defaults to None.
+            file_path (str): Path to save the file. If None, a temporary file is created.
+ """ + if file_path is None: + file_path = tempfile.mktemp(".wav") + wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language) + wavfile.write(file_path, sr, wav) + return file_path + + +if __name__ == "__main__": + import time + + api = CS_API() + print(api.speakers) + print(api.list_speakers_as_tts_models()) + + ts = time.time() + wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name) + print(f" [i] XTTS took {time.time() - ts:.2f}s") + + filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav") + + api = CS_API(model="XTTS-multilang") + print(api.speakers) + + ts = time.time() + wav, sr = api.tts( + "It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en" + ) + print(f" [i] XTTS took {time.time() - ts:.2f}s") + + filepath = api.tts_to_file( + text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en" + ) diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py index 2b140e56aa..16644ff95e 100644 --- a/TTS/tts/models/tortoise.py +++ b/TTS/tts/models/tortoise.py @@ -72,7 +72,7 @@ def load_discrete_vocoder_diffuser( ) -def format_conditioning(clip, cond_length=132300, device="cuda"): +def format_conditioning(clip, cond_length=132300, device="cuda", **kwargs): """ Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models. """ @@ -82,7 +82,7 @@ def format_conditioning(clip, cond_length=132300, device="cuda"): elif gap > 0: rand_start = random.randint(0, gap) clip = clip[:, rand_start : rand_start + cond_length] - mel_clip = TorchMelSpectrogram()(clip.unsqueeze(0)).squeeze(0) + mel_clip = TorchMelSpectrogram(**kwargs)(clip.unsqueeze(0)).squeeze(0) return mel_clip.unsqueeze(0).to(device) @@ -321,6 +321,7 @@ class Tortoise(BaseTTS): def __init__(self, config: Coqpit): super().__init__(config, ap=None, tokenizer=None) + self.mel_norm_path = None self.config = config self.ar_checkpoint = self.args.ar_checkpoint self.diff_checkpoint = self.args.diff_checkpoint # TODO: check if this is even needed @@ -429,7 +430,7 @@ def get_conditioning_latents( auto_conds = [] for ls in voice_samples: - auto_conds.append(format_conditioning(ls[0], device=self.device)) + auto_conds.append(format_conditioning(ls[0], device=self.device, mel_norm_file=self.mel_norm_path)) auto_conds = torch.stack(auto_conds, dim=1) with self.temporary_cuda(self.autoregressive) as ar: auto_latent = ar.get_conditioning(auto_conds) @@ -873,6 +874,7 @@ def load_checkpoint( diff_path = diff_checkpoint_path or os.path.join(checkpoint_dir, "diffusion_decoder.pth") clvp_path = clvp_checkpoint_path or os.path.join(checkpoint_dir, "clvp2.pth") vocoder_checkpoint_path = vocoder_checkpoint_path or os.path.join(checkpoint_dir, "vocoder.pth") + self.mel_norm_path = os.path.join(checkpoint_dir, "mel_norms.pth") if os.path.exists(ar_path): # remove keys from the checkpoint that are not in the model diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 29bdb24ae2..d6fbfb1624 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -88,7 +88,7 @@ def _add_model(model_name: str): def _list_models(self, model_type, model_count=0): if self.verbose: - print(" Name format: type/language/dataset/model") + print("\n Name format: type/language/dataset/model") model_list = [] for lang in self.models_dict[model_type]: for dataset in self.models_dict[model_type][lang]: diff --git 
diff --git a/docs/source/inference.md b/docs/source/inference.md
index 3dd9232e59..b372efe365 100644
--- a/docs/source/inference.md
+++ b/docs/source/inference.md
@@ -191,9 +191,25 @@ from TTS.api import CS_API

 # Init 🐸 Coqui Studio API
 # you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
-api = CS_API(api_token=<token>)
+
+# XTTS - Best quality and life-like speech in EN
+api = CS_API(api_token=<token>, model="XTTS")
+api.speakers  # all the speakers are available with all the models
+api.list_speakers()
+api.list_voices()
+wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, speed=1.5)
+
+# XTTS-multilang - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon)
+api = CS_API(api_token=<token>, model="XTTS-multilang")
+api.speakers
+api.list_speakers()
+api.list_voices()
+wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, language="en", speed=1.5)
+
+# V1 - Fast and lightweight TTS in EN with emotion control
+api = CS_API(api_token=<token>, model="V1")
 api.speakers
-api.emotions
+api.emotions  # emotions are only available with the V1 model
 api.list_speakers()
 api.list_voices()
 wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, emotion="Happy", speed=1.5)
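+
+# A minimal sketch for saving the returned audio to disk; `tts()` returns a NumPy
+# array and its sample rate, so `scipy.io.wavfile` (already used internally by
+# `TTS.cs_api`) can write it directly. The output file name is illustrative.
+from scipy.io import wavfile
+wavfile.write("studio_output.wav", sample_rate, wav)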