Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added smallest.ai voice synthesizer to vocode #721

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions vocode/streaming/models/synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class SynthesizerType(str, Enum):
BARK = "synthesizer_bark"
POLLY = "synthesizer_polly"
CARTESIA = "synthesizer_cartesia"
SMALLEST = "synthesizer_smallest"


class SentimentConfig(BaseModel):
Expand Down Expand Up @@ -245,3 +246,13 @@ class CartesiaSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.CARTESIA
model_id: str = DEFAULT_CARTESIA_MODEL_ID
voice_id: str = DEFAULT_CARTESIA_VOICE_ID
experimental_voice_controls: Optional[CartesiaVoiceControls] = None

class SmallestSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.SMALLEST.value):
    """Configuration for the smallest.ai Waves text-to-speech synthesizer.

    Fields mirror the Waves API request payload; ``sampling_rate`` is
    redeclared as required to match what the API payload needs.
    """

    # Optional here because SmallestSynthesizer resolves the token from the
    # SMALLEST_API_KEY environment variable when it is not supplied — a
    # required field would break the telephony default_synthesizer_config()
    # constructors, which build this config without a token.
    api_token: Optional[str] = None
    model: str = "lightning"  # Waves model name, interpolated into the request URL
    voice_id: str = "aravind"
    language: str = "hi"
    speed: float = 1.0
    remove_extra_silence: bool = False
    transliterate: bool = True
    sampling_rate: int
43 changes: 39 additions & 4 deletions vocode/streaming/models/telephony.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.model import BaseModel, TypedModel
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig, SynthesizerConfig
from vocode.streaming.models.synthesizer import SmallestSynthesizerConfig, SynthesizerConfig
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
Expand All @@ -29,6 +29,12 @@ class TwilioConfig(TelephonyProviderConfig):
extra_params: Optional[Dict[str, Any]] = {}
account_supports_any_caller_id: bool = True

class PlivoConfig(TelephonyProviderConfig):
    """Plivo account credentials and per-call options for the telephony layer."""

    auth_id: str
    auth_token: str
    extra_params: Optional[Dict[str, Any]] = {}
    account_supports_any_caller_id: bool = True
    # Explicit None default: an Optional field without a default is
    # required-but-nullable under pydantic v2, and every other Optional field
    # in this file carries a default.
    dtmf_url: Optional[str] = None  # webhook invoked for DTMF events, if set

class VonageConfig(TelephonyProviderConfig):
api_key: str
Expand All @@ -52,12 +58,14 @@ class CreateInboundCall(BaseModel):
conversation_id: Optional[str] = None
twilio_config: Optional[TwilioConfig] = None
vonage_config: Optional[VonageConfig] = None
plivo_config: Optional[PlivoConfig] = None


class EndOutboundCall(BaseModel):
call_id: str
vonage_config: Optional[VonageConfig] = None
twilio_config: Optional[TwilioConfig] = None
plivo_config: Optional[PlivoConfig] = None


class CreateOutboundCall(BaseModel):
Expand All @@ -69,6 +77,7 @@ class CreateOutboundCall(BaseModel):
conversation_id: Optional[str] = None
vonage_config: Optional[VonageConfig] = None
twilio_config: Optional[TwilioConfig] = None
plivo_config: Optional[PlivoConfig] = None
# TODO add IVR/etc.


Expand All @@ -83,12 +92,14 @@ class DialIntoZoomCall(BaseModel):
conversation_id: Optional[str] = None
vonage_config: Optional[VonageConfig] = None
twilio_config: Optional[TwilioConfig] = None
plivo_config: Optional[PlivoConfig] = None


class CallConfigType(str, Enum):
BASE = "call_config_base"
TWILIO = "call_config_twilio"
VONAGE = "call_config_vonage"
PLIVO = "call_config_plivo"


PhoneCallDirection = Literal["inbound", "outbound"]
Expand Down Expand Up @@ -131,7 +142,7 @@ def default_transcriber_config():

@staticmethod
def default_synthesizer_config():
    # Default synthesizer for this call config, using the standard telephony
    # sampling rate and audio encoding.
    # NOTE(review): switched from AzureSynthesizerConfig to
    # SmallestSynthesizerConfig; SmallestSynthesizerConfig declares api_token
    # as a required field, so constructing it without one may fail
    # validation — confirm a default/env fallback exists.
    return SmallestSynthesizerConfig(
        sampling_rate=DEFAULT_SAMPLING_RATE,
        audio_encoding=DEFAULT_AUDIO_ENCODING,
    )
Expand All @@ -155,10 +166,34 @@ def default_transcriber_config():

@staticmethod
def default_synthesizer_config():
    # Default synthesizer for Vonage calls, using Vonage's sampling rate and
    # audio encoding constants.
    # NOTE(review): switched from AzureSynthesizerConfig to
    # SmallestSynthesizerConfig; SmallestSynthesizerConfig declares api_token
    # as a required field, so constructing it without one may fail
    # validation — confirm a default/env fallback exists.
    return SmallestSynthesizerConfig(
        sampling_rate=VONAGE_SAMPLING_RATE,
        audio_encoding=VONAGE_AUDIO_ENCODING,
    )


TelephonyConfig = Union[TwilioConfig, VonageConfig]
class PlivoCallConfig(BaseCallConfig, type=CallConfigType.PLIVO.value):  # type: ignore
    """Call configuration for calls placed through Plivo."""

    plivo_config: PlivoConfig
    plivo_id: str  # Plivo's identifier for this call leg

    @staticmethod
    def default_transcriber_config():
        # Deepgram nova-2 with Hindi, chosen to pair with the Smallest
        # synthesizer's default Hindi voice.
        return DeepgramTranscriberConfig(
            sampling_rate=DEFAULT_SAMPLING_RATE,
            audio_encoding=DEFAULT_AUDIO_ENCODING,
            chunk_size=DEFAULT_CHUNK_SIZE,
            model="nova-2",
            language="hi",
            endpointing_config=PunctuationEndpointingConfig(),
        )

    @staticmethod
    def default_synthesizer_config():
        # NOTE(review): SmallestSynthesizerConfig declares api_token as a
        # required field — confirm it can be constructed without one here.
        return SmallestSynthesizerConfig(
            sampling_rate=DEFAULT_SAMPLING_RATE,
            audio_encoding=DEFAULT_AUDIO_ENCODING,
        )


# Union of all supported telephony provider configurations.
TelephonyConfig = Union[TwilioConfig, VonageConfig, PlivoConfig]
142 changes: 142 additions & 0 deletions vocode/streaming/output_device/plivo_output_device.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
from __future__ import annotations

import asyncio
import audioop
import base64
import json
from typing import List, Optional, Union

from fastapi import WebSocket
from fastapi.websockets import WebSocketState
from loguru import logger
from pydantic import BaseModel

from vocode.streaming.output_device.abstract_output_device import AbstractOutputDevice
from vocode.streaming.output_device.audio_chunk import AudioChunk, ChunkState
from vocode.streaming.telephony.constants import PLIVO_AUDIO_ENCODING, PLIVO_SAMPLING_RATE
from vocode.streaming.utils.create_task import asyncio_create_task
from vocode.streaming.utils.dtmf_utils import DTMFToneGenerator, KeypadEntry
from vocode.streaming.utils.worker import InterruptibleEvent


class ChunkFinishedMarkMessage(BaseModel):
    """Mark event indicating Plivo finished playing a given audio chunk."""

    # ID of the audio chunk whose playback has completed.
    chunk_id: str


# Kept as a Union so additional mark-message variants can be added later.
MarkMessage = Union[ChunkFinishedMarkMessage]  # space for more mark messages


class PlivoOutputDevice(AbstractOutputDevice):
    """Output device that streams synthesized audio into a Plivo call.

    All outbound WebSocket traffic (media, checkpoint, and clear events) is
    funneled through ``_plivo_events_queue`` so a single task owns the writes.
    """

    def __init__(self, ws: Optional[WebSocket] = None, stream_sid: Optional[str] = None):
        super().__init__(sampling_rate=PLIVO_SAMPLING_RATE, audio_encoding=PLIVO_AUDIO_ENCODING)
        self.ws = ws
        self.stream_sid = stream_sid
        self.active = True

        # JSON strings waiting to be written to the Plivo WebSocket.
        self._plivo_events_queue: asyncio.Queue[str] = asyncio.Queue()
        # Mark messages received from Plivo; currently unused — the read in
        # _process_mark_messages is commented out.
        self._mark_message_queue: asyncio.Queue[MarkMessage] = asyncio.Queue()
        # Chunks already sent to Plivo but not yet finalized as played/interrupted.
        self._unprocessed_audio_chunks_queue: asyncio.Queue[InterruptibleEvent[AudioChunk]] = (
            asyncio.Queue()
        )

    def consume_nonblocking(self, item: InterruptibleEvent[AudioChunk]):
        """Send an audio chunk to Plivo, or finalize it if already interrupted."""
        if not item.is_interrupted():
            self._send_audio_chunk_and_mark(
                chunk=item.payload.data, chunk_id=str(item.payload.chunk_id)
            )
            self._unprocessed_audio_chunks_queue.put_nowait(item)
        else:
            # Chunk was interrupted before it was ever sent: finalize locally.
            audio_chunk = item.payload
            audio_chunk.on_interrupt()
            audio_chunk.state = ChunkState.INTERRUPTED

    def interrupt(self):
        # Ask Plivo to drop any audio it has buffered for this stream.
        self._send_clear_message()

    def enqueue_mark_message(self, mark_message: MarkMessage):
        """Called by the telephony layer when Plivo reports a checkpoint."""
        self._mark_message_queue.put_nowait(mark_message)

    def send_dtmf_tones(self, keypad_entries: List[KeypadEntry]):
        """Queue raw DTMF tone audio as media events, one per keypad entry."""
        tone_generator = DTMFToneGenerator()
        for keypad_entry in keypad_entries:
            logger.info(f"Sending DTMF tone {keypad_entry.value}")
            dtmf_tone = tone_generator.generate(
                keypad_entry, sampling_rate=self.sampling_rate, audio_encoding=self.audio_encoding
            )
            # NOTE(review): uses "streamSid" while the other events in this
            # class use "streamId" — confirm which key Plivo expects.
            dtmf_message = {
                "event": "media",
                "streamSid": self.stream_sid,
                "media": {"payload": base64.b64encode(dtmf_tone).decode("utf-8")},
            }
            self._plivo_events_queue.put_nowait(json.dumps(dtmf_message))

    async def _send_plivo_messages(self):
        # Single writer task: drains the event queue onto the WebSocket until
        # cancelled or the socket disconnects.
        while True:
            try:
                plivo_event = await self._plivo_events_queue.get()
            except asyncio.CancelledError:
                return
            if self.ws.application_state == WebSocketState.DISCONNECTED:
                break
            await self.ws.send_text(plivo_event)

    async def _process_mark_messages(self):
        # Finalizes chunk state (played/interrupted) in send order.
        while True:
            try:
                # mark messages are tagged with the chunk ID that is attached to the audio chunk
                # but they are guaranteed to come in the same order as the audio chunks, and we
                # don't need to build resiliency there
                # mark_message = await self._mark_message_queue.get()
                # NOTE(review): unlike waiting on _mark_message_queue, this
                # finalizes a chunk as soon as it is dequeued here, without
                # waiting for Plivo's checkpoint acknowledgement — confirm
                # this is the intended interruption semantics.
                item = await self._unprocessed_audio_chunks_queue.get()
            except asyncio.CancelledError:
                return

            self.interruptible_event = item
            audio_chunk = item.payload

            # if mark_message.chunk_id != str(audio_chunk.chunk_id):
            #     logger.error(
            #         f"Received a mark message out of order with chunk ID {mark_message.chunk_id}"
            #     )

            if item.is_interrupted():
                audio_chunk.on_interrupt()
                audio_chunk.state = ChunkState.INTERRUPTED
                continue

            audio_chunk.on_play()
            audio_chunk.state = ChunkState.PLAYED

            # Once played, the chunk can no longer be interrupted.
            self.interruptible_event.is_interruptible = False

    async def _run_loop(self):
        # Run the WebSocket writer and the chunk finalizer concurrently.
        send_plivo_messages_task = asyncio_create_task(self._send_plivo_messages())
        process_mark_messages_task = asyncio_create_task(self._process_mark_messages())
        await asyncio.gather(send_plivo_messages_task, process_mark_messages_task)

    # TODO: Plivo
    def _send_audio_chunk_and_mark(self, chunk: bytes, chunk_id: str):
        """Queue a playAudio event for ``chunk`` followed by a checkpoint event."""
        # NOTE(review): sampleRate is hard-coded to 8000 — presumably equal
        # to PLIVO_SAMPLING_RATE; consider using self.sampling_rate instead.
        media_message = {
            'event': 'playAudio',
            # 'streamSid': self.stream_sid,
            'media': {
                'payload': base64.b64encode(chunk).decode("utf-8"),
                'sampleRate': 8000,
                'contentType': 'audio/x-l16'
            }
        }
        self._plivo_events_queue.put_nowait(json.dumps(media_message))

        # The checkpoint lets Plivo echo back when playback reaches chunk_id.
        mark_message = {
            "event": "checkpoint",
            "streamId": self.stream_sid,
            "name": chunk_id
        }
        self._plivo_events_queue.put_nowait(json.dumps(mark_message))

    def _send_clear_message(self):
        # clearAudio drops any buffered-but-unplayed audio on Plivo's side.
        clear_message = {
            "event": "clearAudio",
            "streamId": self.stream_sid,
        }
        self._plivo_events_queue.put_nowait(json.dumps(clear_message))
8 changes: 8 additions & 0 deletions vocode/streaming/synthesizer/default_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
RimeSynthesizerConfig,
StreamElementsSynthesizerConfig,
SynthesizerConfig,
SmallestSynthesizerConfig


)


from vocode.streaming.synthesizer.abstract_factory import AbstractSynthesizerFactory
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
Expand All @@ -19,6 +24,7 @@
from vocode.streaming.synthesizer.play_ht_synthesizer_v2 import PlayHtSynthesizerV2
from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer
from vocode.streaming.synthesizer.stream_elements_synthesizer import StreamElementsSynthesizer
from vocode.streaming.synthesizer.smallest_synthesizer import SmallestSynthesizer


class DefaultSynthesizerFactory(AbstractSynthesizerFactory):
Expand All @@ -44,5 +50,7 @@ def create_synthesizer(
return RimeSynthesizer(synthesizer_config)
elif isinstance(synthesizer_config, StreamElementsSynthesizerConfig):
return StreamElementsSynthesizer(synthesizer_config)
elif isinstance(synthesizer_config, SmallestSynthesizerConfig):
return SmallestSynthesizer(synthesizer_config)
else:
raise Exception("Invalid synthesizer config")
87 changes: 87 additions & 0 deletions vocode/streaming/synthesizer/smallest_synthesizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import asyncio

import requests
from typing import AsyncGenerator, Optional

from vocode import getenv


from vocode.streaming.models.message import BaseMessage
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer, SynthesisResult
from vocode.streaming.models.synthesizer import SynthesizerConfig

# NOTE(review): this duplicates SmallestSynthesizerConfig defined in
# vocode.streaming.models.synthesizer (same type tag and fields). The default
# synthesizer factory imports the models version, so its isinstance() dispatch
# will not match instances of this class — consider importing the models
# version instead of redefining it here.
class SmallestSynthesizerConfig(SynthesizerConfig, type="synthesizer_smallest"):
    """Configuration for the smallest.ai Waves synthesizer (duplicate — see note)."""

    api_token: str
    model: str = "lightning"
    voice_id: str = "aravind"
    language: str = "hi"
    speed: float = 1
    remove_extra_silence: bool = False
    transliterate: bool = True
    sampling_rate: int

class SmallestSynthesizer(BaseSynthesizer[SmallestSynthesizerConfig]):
    """Synthesizer backed by the smallest.ai Waves text-to-speech API."""

    # 320 bytes = 20 ms of 8 kHz 16-bit mono PCM, the standard telephony frame.
    CHUNK_SIZE = 320

    def __init__(
        self,
        synthesizer_config: SmallestSynthesizerConfig,
        smallest_api_key: Optional[str] = None,
    ):
        """Resolve the API key (parameter or SMALLEST_API_KEY env var) and build the request URL.

        Raises:
            ValueError: if no API key is available from either source.
        """
        super().__init__(synthesizer_config)
        # HTTPS: the bearer token travels in a header, so never use plain HTTP.
        self.lightning_url = (
            f"https://waves-api.smallest.ai/api/v1/{synthesizer_config.model}/get_speech"
        )
        smallest_api_key = smallest_api_key or getenv("SMALLEST_API_KEY")
        if not smallest_api_key:
            raise ValueError(
                "Please set SMALLEST_API_KEY environment variable or pass it as a parameter"
            )
        self.synthesizer_config.api_token = smallest_api_key

    async def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        is_first_text_chunk: bool = False,
        is_sole_text_chunk: bool = False,
    ) -> SynthesisResult:
        """Synthesize ``message`` and return its audio as fixed-size chunks.

        The caller-supplied ``chunk_size`` is deliberately overridden with
        ``CHUNK_SIZE`` (20 ms telephony frames) to match the media cadence.
        """
        chunk_size = self.CHUNK_SIZE
        audio_data = await self._generate_audio(message.text)

        async def chunk_generator() -> AsyncGenerator[SynthesisResult.ChunkResult, None]:
            for offset in range(0, len(audio_data), chunk_size):
                piece = audio_data[offset : offset + chunk_size]
                is_last = offset + chunk_size >= len(audio_data)
                yield SynthesisResult.ChunkResult(piece, is_last)

        def get_message_up_to(seconds: float) -> str:
            # Simplified: no audio/text alignment is available, so always
            # report the full message text regardless of playback position.
            return message.text

        return SynthesisResult(chunk_generator(), get_message_up_to)

    async def _generate_audio(self, text: str) -> bytes:
        """POST ``text`` to the Waves API and return the raw audio bytes.

        Runs the blocking ``requests`` call in a worker thread so the event
        loop is not stalled for the duration of the HTTP request.

        Raises:
            requests.HTTPError: if the API responds with an error status.
        """
        payload = {
            "text": text,
            "voice_id": self.synthesizer_config.voice_id,
            "add_wav_header": False,
            "sample_rate": self.synthesizer_config.sampling_rate,
            "language": self.synthesizer_config.language,
            "speed": self.synthesizer_config.speed,
            "keep_ws_open": True,
            "remove_extra_silence": self.synthesizer_config.remove_extra_silence,
            "transliterate": self.synthesizer_config.transliterate,
            "get_end_of_response_token": True,
        }
        headers = {
            "Authorization": f"Bearer {self.synthesizer_config.api_token}",
            "Content-Type": "application/json",
        }

        response = await asyncio.to_thread(
            requests.post, self.lightning_url, json=payload, headers=headers
        )
        # Surface API errors instead of streaming an error body as audio.
        response.raise_for_status()
        return response.content
Loading