livekit · bcherry · Sep 11, 2024 · Sep 10, 2024 · Sep 10, 2024 · Sep 10, 2024
diff --git a/examples/text-to-speech/cartesia_tts.py b/examples/text-to-speech/cartesia_tts.py
@@ -0,0 +1,53 @@
+import asyncio
+import logging
+
+from dotenv import load_dotenv
+from livekit import rtc
+from livekit.agents import AutoSubscribe, JobContext, WorkerOptions, cli
+from livekit.plugins import cartesia
+
+load_dotenv()
+
+logger = logging.getLogger("cartesia-tts-demo")
+logger.setLevel(logging.INFO)
+
+
+async def entrypoint(job: JobContext):
+    """Publish an audio track and speak two phrases using Cartesia TTS.
+
+    Connects to the job's room without auto-subscribing to remote tracks,
+    publishes a local audio track fed by the TTS output, waits for that track
+    to be subscribed, then synthesizes a greeting and, four seconds later, a
+    farewell.
+    """
+    logger.info("starting tts example agent")
+
+    # Experimental Cartesia voice controls; see
+    # https://docs.cartesia.ai/user-guides/voice-control
+    tts = cartesia.TTS(voice_controls={"speed": "fastest", "emotion": ["surprise:highest"]})
+
+    # Create the audio source with the TTS's own sample rate and channel count.
+    source = rtc.AudioSource(tts.sample_rate, tts.num_channels)
+    track = rtc.LocalAudioTrack.create_audio_track("agent-mic", source)
+    options = rtc.TrackPublishOptions()
+    options.source = rtc.TrackSource.SOURCE_MICROPHONE
+
+    await job.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_NONE)
+    publication = await job.room.local_participant.publish_track(track, options)
+    # Wait for the publication's subscription before synthesizing anything.
+    await publication.wait_for_subscription()
+
+    greeting = "Hello I hope you are having a great day."
+    logger.info('Saying "%s"', greeting)
+    async for output in tts.synthesize(greeting):
+        await source.capture_frame(output.frame)
+
+    await asyncio.sleep(4)
+    farewell = "Goodbye I hope to see you again soon."
+    logger.info('Saying "%s"', farewell)
+    async for output in tts.synthesize(farewell):
+        await source.capture_frame(output.frame)
+
+
+if __name__ == "__main__":
+    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
diff --git a/examples/text-to-speech/requirements.txt b/examples/text-to-speech/requirements.txt
@@ -1,2 +1,4 @@
 livekit-agents>=0.8.10
 livekit-plugins-openai>=0.8.1
+livekit-plugins-cartesia>=0.4.1
+livekit-plugins-elevenlabs>=0.7.3
diff --git a/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py b/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py
@@ -41,6 +41,7 @@ class _TTSOptions:
     encoding: TTSEncoding
     sample_rate: int
     voice: str | list[float]
+    voice_controls: dict[str, any] | None
     api_key: str
     language: str
 
@@ -53,15 +54,25 @@ def __init__(
         language: str = "en",
         encoding: TTSEncoding = "pcm_s16le",
         voice: str | list[float] = TTSDefaultVoiceId,
+        voice_controls: dict[str, any] | None = None,
         sample_rate: int = 24000,
         api_key: str | None = None,
         http_session: aiohttp.ClientSession | None = None,
     ) -> None:
         """
         Create a new instance of Cartesia TTS.
-
-        ``api_key`` must be set to your Cartesia API key, either using the argument or by setting
-        the ``CARTESIA_API_KEY`` environmental variable.
+
+        See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech for more details on the Cartesia API.
+
+        Args:
+            model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-english".
+            language (str, optional): The language code for synthesis. Defaults to "en".
+            encoding (TTSEncoding, optional): The audio encoding format. Defaults to "pcm_s16le".
+            voice (str | list[float], optional): The voice ID or embedding array.
+            voice_controls (dict[str, Any] | None, optional): Experimental voice control parameters. See https://docs.cartesia.ai/user-guides/voice-control for more details.
+            sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
+            api_key (str | None, optional): The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
+            http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
         """
 
         super().__init__(
@@ -80,6 +91,7 @@ def __init__(
             encoding=encoding,
             sample_rate=sample_rate,
             voice=voice,
+            voice_controls=voice_controls,
             api_key=api_key,
         )
         self._session = http_session
@@ -274,6 +286,9 @@ def _to_cartesia_options(opts: _TTSOptions) -> dict[str, Any]:
     else:
         voice["mode"] = "embedding"
         voice["embedding"] = opts.voice
+
+    if opts.voice_controls:
+        voice["__experimental_controls"] = opts.voice_controls
 
     return {
         "model_id": opts.model,