coqui-ai · erogol · Nov 8, 2023 · Nov 8, 2023
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 ## 🐸Coqui.ai News
 - 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
 - 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
-- 📣 ⓍTTS can now stream with <200ms latency. 
+- 📣 ⓍTTS can now stream with <200ms latency.
 - 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
 - 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
 - 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
@@ -267,19 +267,13 @@ models = TTS(cs_api_model="XTTS").list_models()
 # Init TTS with the target studio speaker
 tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
 # Run TTS
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)
 
 # V1 model
 models = TTS(cs_api_model="V1").list_models()
 # Run TTS with emotion and speed control
 # Emotion control only works with V1 model
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
-
-# XTTS-multilingual
-models = TTS(cs_api_model="XTTS-multilingual").list_models()
-# Run TTS with emotion and speed control
-# Emotion control only works with V1 model
-tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0)
 ```
 
 #### Example text to speech using **Fairseq models in ~1100 languages** 🤯.

diff --git a/TTS/api.py b/TTS/api.py
@@ -60,7 +60,7 @@ def __init__(
             vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
             progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True.
             cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
-                "XTTS", "XTTS-multilingual", "V1". You can also use `TTS.cs_api.CS_API" for more control.
+                "XTTS", "V1". You can also use `TTS.cs_api.CS_API` for more control.
                 Defaults to "XTTS".
             gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
         """
@@ -275,7 +275,7 @@ def tts_coqui_studio(
             speaker_name (str, optional):
                 Speaker name from Coqui Studio. Defaults to None.
             language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+                supported by `XTTS` model.
             emotion (str, optional):
                 Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
                 with "V1" model. Defaults to None.
@@ -321,7 +321,7 @@ def tts(
                 Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                 `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
             language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+                supported by `XTTS` model.
             speaker_wav (str, optional):
                 Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                 Defaults to None.

diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
@@ -227,7 +227,7 @@ def main():
     parser.add_argument(
         "--cs_model",
         type=str,
-        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.",
+        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
     )
     parser.add_argument(
         "--emotion",
@@ -238,7 +238,7 @@ def main():
     parser.add_argument(
         "--language",
         type=str,
-        help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
+        help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.",
         default=None,
     )
     parser.add_argument(

diff --git a/TTS/cs_api.py b/TTS/cs_api.py
@@ -43,7 +43,7 @@ class CS_API:
     Args:
         api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
             `COQUI_STUDIO_TOKEN`.
-        model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`.
+        model (str): 🐸Coqui Studio model. It can be either `V1` or `XTTS`. Default is `XTTS`.
 
 
     Example listing all available speakers:
@@ -65,7 +65,7 @@ class CS_API:
 
     Example with multi-language model:
         >>> from TTS.api import CS_API
-        >>> tts = CS_API(model="XTTS-multilang")
+        >>> tts = CS_API(model="XTTS")
         >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
     """
 
@@ -78,16 +78,12 @@ class CS_API:
         "XTTS": {
             "list_speakers": "https://app.coqui.ai/api/v2/speakers",
             "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
-            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
-        },
-        "XTTS-multilang": {
-            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
-            "synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/",
-            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
+            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts",
         },
     }
 
-    SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"]
+
+    SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]
 
     def __init__(self, api_token=None, model="XTTS"):
         self.api_token = api_token
@@ -139,7 +135,7 @@ def list_speakers(self):
         self._check_token()
         conn = http.client.HTTPSConnection("app.coqui.ai")
         url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
-        conn.request("GET", f"{url}?per_page=100", headers=self.headers)
+        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
         res = conn.getresponse()
         data = res.read()
         return [Speaker(s) for s in json.loads(data)["result"]]
@@ -148,7 +144,7 @@ def list_voices(self):
         """List custom voices created by the user."""
         conn = http.client.HTTPSConnection("app.coqui.ai")
         url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
-        conn.request("GET", f"{url}", headers=self.headers)
+        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
         res = conn.getresponse()
         data = res.read()
         return [Speaker(s, True) for s in json.loads(data)["result"]]
@@ -197,14 +193,6 @@ def _create_payload(model, text, speaker, speed, emotion, language):
                 }
             )
         elif model == "XTTS":
-            payload.update(
-                {
-                    "name": speaker.name,
-                    "text": text,
-                    "speed": speed,
-                }
-            )
-        elif model == "XTTS-multilang":
             payload.update(
                 {
                     "name": speaker.name,
@@ -226,13 +214,10 @@ def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, langua
             assert language is None, "❗ language is not supported for V1 model."
         elif self.model == "XTTS":
             assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
-            assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model."
-        elif self.model == "XTTS-multilang":
-            assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model."
-            assert language is not None, "❗ Language is required for XTTS-multilang model."
+            assert language is not None, "❗ Language is required for XTTS model."
             assert (
                 language in self.SUPPORTED_LANGUAGES
-            ), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl"
+            ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."
         return text, speaker_name, speaker_id, emotion, speed, language
 
     def tts(
@@ -255,7 +240,7 @@ def tts(
                 supported by `V1` model. Defaults to None.
             speed (float): Speed of the speech. 1.0 is normal speed.
             language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+                supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
         """
         self._check_token()
         self.ping_api()
@@ -305,7 +290,7 @@ def tts_to_file(
             speed (float): Speed of the speech. 1.0 is normal speed.
             pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
             language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+                supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
             file_path (str): Path to save the file. If None, a temporary file is created.
         """
         if file_path is None:
@@ -323,20 +308,7 @@ def tts_to_file(
     print(api.list_speakers_as_tts_models())
 
     ts = time.time()
-    wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name)
-    print(f" [i] XTTS took {time.time() - ts:.2f}s")
-
-    filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav")
-
-    api = CS_API(model="XTTS-multilang")
-    print(api.speakers)
-
-    ts = time.time()
-    wav, sr = api.tts(
-        "It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en"
-    )
+    wav, sr = api.tts("It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name)
     print(f" [i] XTTS took {time.time() - ts:.2f}s")
 
-    filepath = api.tts_to_file(
-        text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en"
-    )
+    filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav")
diff --git a/docs/source/inference.md b/docs/source/inference.md
@@ -198,19 +198,12 @@ from TTS.api import CS_API
 # Init 🐸 Coqui Studio API
 # you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
 
-# XTTS - Best quality and life-like speech in EN
+# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
 api = CS_API(api_token=<token>, model="XTTS")
 api.speakers  # all the speakers are available with all the models.
 api.list_speakers()
 api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
-
-# XTTS-multilingual - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon)
-api = CS_API(api_token=<token>, model="XTTS-multilingual")
-api.speakers
-api.list_speakers()
-api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
+wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, language="en", speed=1.5)
 
 # V1 - Fast and lightweight TTS in EN with emotion control.
 api = CS_API(api_token=<token>, model="V1")