Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added smallest.ai voice synthesizer to vocode #721

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions vocode/streaming/models/synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class SynthesizerType(str, Enum):
BARK = "synthesizer_bark"
POLLY = "synthesizer_polly"
CARTESIA = "synthesizer_cartesia"
SMALLEST = "synthesizer_smallest"


class SentimentConfig(BaseModel):
Expand Down Expand Up @@ -245,3 +246,13 @@ class CartesiaSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.CARTESIA
model_id: str = DEFAULT_CARTESIA_MODEL_ID
voice_id: str = DEFAULT_CARTESIA_VOICE_ID
experimental_voice_controls: Optional[CartesiaVoiceControls] = None

class SmallestSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.SMALLEST.value):
    """Configuration for the smallest.ai Waves text-to-speech synthesizer.

    Fields mirror the Waves API request payload; ``sampling_rate`` is
    redeclared as required to match what the API payload needs.
    """

    # Optional here because SmallestSynthesizer resolves the token from the
    # SMALLEST_API_KEY environment variable when it is not supplied — a
    # required field would break the telephony default_synthesizer_config()
    # constructors, which build this config without a token.
    api_token: Optional[str] = None
    model: str = "lightning"  # Waves model name, interpolated into the request URL
    voice_id: str = "aravind"
    language: str = "hi"
    speed: float = 1.0
    remove_extra_silence: bool = False
    transliterate: bool = True
    sampling_rate: int
43 changes: 39 additions & 4 deletions vocode/streaming/models/telephony.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.model import BaseModel, TypedModel
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig, SynthesizerConfig
from vocode.streaming.models.synthesizer import SmallestSynthesizerConfig, SynthesizerConfig
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
Expand All @@ -29,6 +29,12 @@ class TwilioConfig(TelephonyProviderConfig):
extra_params: Optional[Dict[str, Any]] = {}
account_supports_any_caller_id: bool = True

class PlivoConfig(TelephonyProviderConfig):
    """Plivo account credentials and per-call options for the telephony layer."""

    auth_id: str
    auth_token: str
    extra_params: Optional[Dict[str, Any]] = {}
    account_supports_any_caller_id: bool = True
    # Explicit None default: an Optional field without a default is
    # required-but-nullable under pydantic v2, and every other Optional field
    # in this file carries a default.
    dtmf_url: Optional[str] = None  # webhook invoked for DTMF events, if set

class VonageConfig(TelephonyProviderConfig):
api_key: str
Expand All @@ -52,12 +58,14 @@ class CreateInboundCall(BaseModel):
conversation_id: Optional[str] = None
twilio_config: Optional[TwilioConfig] = None
vonage_config: Optional[VonageConfig] = None
plivo_config: Optional[PlivoConfig] = None


class EndOutboundCall(BaseModel):
call_id: str
vonage_config: Optional[VonageConfig] = None
twilio_config: Optional[TwilioConfig] = None
plivo_config: Optional[PlivoConfig] = None


class CreateOutboundCall(BaseModel):
Expand All @@ -69,6 +77,7 @@ class CreateOutboundCall(BaseModel):
conversation_id: Optional[str] = None
vonage_config: Optional[VonageConfig] = None
twilio_config: Optional[TwilioConfig] = None
plivo_config: Optional[PlivoConfig] = None
# TODO add IVR/etc.


Expand All @@ -83,12 +92,14 @@ class DialIntoZoomCall(BaseModel):
conversation_id: Optional[str] = None
vonage_config: Optional[VonageConfig] = None
twilio_config: Optional[TwilioConfig] = None
plivo_config: Optional[PlivoConfig] = None


class CallConfigType(str, Enum):
BASE = "call_config_base"
TWILIO = "call_config_twilio"
VONAGE = "call_config_vonage"
PLIVO = "call_config_plivo"


PhoneCallDirection = Literal["inbound", "outbound"]
Expand Down Expand Up @@ -131,7 +142,7 @@ def default_transcriber_config():

@staticmethod
def default_synthesizer_config():
    # Default synthesizer for this call config, using the standard telephony
    # sampling rate and audio encoding.
    # NOTE(review): switched from AzureSynthesizerConfig to
    # SmallestSynthesizerConfig; SmallestSynthesizerConfig declares api_token
    # as a required field, so constructing it without one may fail
    # validation — confirm a default/env fallback exists.
    return SmallestSynthesizerConfig(
        sampling_rate=DEFAULT_SAMPLING_RATE,
        audio_encoding=DEFAULT_AUDIO_ENCODING,
    )
Expand All @@ -155,10 +166,34 @@ def default_transcriber_config():

@staticmethod
def default_synthesizer_config():
    # Default synthesizer for Vonage calls, using Vonage's sampling rate and
    # audio encoding constants.
    # NOTE(review): switched from AzureSynthesizerConfig to
    # SmallestSynthesizerConfig; SmallestSynthesizerConfig declares api_token
    # as a required field, so constructing it without one may fail
    # validation — confirm a default/env fallback exists.
    return SmallestSynthesizerConfig(
        sampling_rate=VONAGE_SAMPLING_RATE,
        audio_encoding=VONAGE_AUDIO_ENCODING,
    )


TelephonyConfig = Union[TwilioConfig, VonageConfig]
class PlivoCallConfig(BaseCallConfig, type=CallConfigType.PLIVO.value):  # type: ignore
    """Call configuration for calls placed through Plivo."""

    plivo_config: PlivoConfig
    plivo_id: str  # Plivo's identifier for this call leg

    @staticmethod
    def default_transcriber_config():
        # Deepgram nova-2 with Hindi, chosen to pair with the Smallest
        # synthesizer's default Hindi voice.
        return DeepgramTranscriberConfig(
            sampling_rate=DEFAULT_SAMPLING_RATE,
            audio_encoding=DEFAULT_AUDIO_ENCODING,
            chunk_size=DEFAULT_CHUNK_SIZE,
            model="nova-2",
            language="hi",
            endpointing_config=PunctuationEndpointingConfig(),
        )

    @staticmethod
    def default_synthesizer_config():
        # NOTE(review): SmallestSynthesizerConfig declares api_token as a
        # required field — confirm it can be constructed without one here.
        return SmallestSynthesizerConfig(
            sampling_rate=DEFAULT_SAMPLING_RATE,
            audio_encoding=DEFAULT_AUDIO_ENCODING,
        )


# Union of all supported telephony provider configurations.
TelephonyConfig = Union[TwilioConfig, VonageConfig, PlivoConfig]
142 changes: 142 additions & 0 deletions vocode/streaming/output_device/plivo_output_device.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
from __future__ import annotations

import asyncio
import audioop
import base64
import json
from typing import List, Optional, Union

from fastapi import WebSocket
from fastapi.websockets import WebSocketState
from loguru import logger
from pydantic import BaseModel

from vocode.streaming.output_device.abstract_output_device import AbstractOutputDevice
from vocode.streaming.output_device.audio_chunk import AudioChunk, ChunkState
from vocode.streaming.telephony.constants import PLIVO_AUDIO_ENCODING, PLIVO_SAMPLING_RATE
from vocode.streaming.utils.create_task import asyncio_create_task
from vocode.streaming.utils.dtmf_utils import DTMFToneGenerator, KeypadEntry
from vocode.streaming.utils.worker import InterruptibleEvent


class ChunkFinishedMarkMessage(BaseModel):
    """Mark event indicating Plivo finished playing a given audio chunk."""

    # ID of the audio chunk whose playback has completed.
    chunk_id: str


# Kept as a Union so additional mark-message variants can be added later.
MarkMessage = Union[ChunkFinishedMarkMessage]  # space for more mark messages


class PlivoOutputDevice(AbstractOutputDevice):
    """Output device that streams synthesized audio into a Plivo call.

    All outbound WebSocket traffic (media, checkpoint, and clear events) is
    funneled through ``_plivo_events_queue`` so a single task owns the writes.
    """

    def __init__(self, ws: Optional[WebSocket] = None, stream_sid: Optional[str] = None):
        super().__init__(sampling_rate=PLIVO_SAMPLING_RATE, audio_encoding=PLIVO_AUDIO_ENCODING)
        self.ws = ws
        self.stream_sid = stream_sid
        self.active = True

        # JSON strings waiting to be written to the Plivo WebSocket.
        self._plivo_events_queue: asyncio.Queue[str] = asyncio.Queue()
        # Mark messages received from Plivo; currently unused — the read in
        # _process_mark_messages is commented out.
        self._mark_message_queue: asyncio.Queue[MarkMessage] = asyncio.Queue()
        # Chunks already sent to Plivo but not yet finalized as played/interrupted.
        self._unprocessed_audio_chunks_queue: asyncio.Queue[InterruptibleEvent[AudioChunk]] = (
            asyncio.Queue()
        )

    def consume_nonblocking(self, item: InterruptibleEvent[AudioChunk]):
        """Send an audio chunk to Plivo, or finalize it if already interrupted."""
        if not item.is_interrupted():
            self._send_audio_chunk_and_mark(
                chunk=item.payload.data, chunk_id=str(item.payload.chunk_id)
            )
            self._unprocessed_audio_chunks_queue.put_nowait(item)
        else:
            # Chunk was interrupted before it was ever sent: finalize locally.
            audio_chunk = item.payload
            audio_chunk.on_interrupt()
            audio_chunk.state = ChunkState.INTERRUPTED

    def interrupt(self):
        # Ask Plivo to drop any audio it has buffered for this stream.
        self._send_clear_message()

    def enqueue_mark_message(self, mark_message: MarkMessage):
        """Called by the telephony layer when Plivo reports a checkpoint."""
        self._mark_message_queue.put_nowait(mark_message)

    def send_dtmf_tones(self, keypad_entries: List[KeypadEntry]):
        """Queue raw DTMF tone audio as media events, one per keypad entry."""
        tone_generator = DTMFToneGenerator()
        for keypad_entry in keypad_entries:
            logger.info(f"Sending DTMF tone {keypad_entry.value}")
            dtmf_tone = tone_generator.generate(
                keypad_entry, sampling_rate=self.sampling_rate, audio_encoding=self.audio_encoding
            )
            # NOTE(review): uses "streamSid" while the other events in this
            # class use "streamId" — confirm which key Plivo expects.
            dtmf_message = {
                "event": "media",
                "streamSid": self.stream_sid,
                "media": {"payload": base64.b64encode(dtmf_tone).decode("utf-8")},
            }
            self._plivo_events_queue.put_nowait(json.dumps(dtmf_message))

    async def _send_plivo_messages(self):
        # Single writer task: drains the event queue onto the WebSocket until
        # cancelled or the socket disconnects.
        while True:
            try:
                plivo_event = await self._plivo_events_queue.get()
            except asyncio.CancelledError:
                return
            if self.ws.application_state == WebSocketState.DISCONNECTED:
                break
            await self.ws.send_text(plivo_event)

    async def _process_mark_messages(self):
        # Finalizes chunk state (played/interrupted) in send order.
        while True:
            try:
                # mark messages are tagged with the chunk ID that is attached to the audio chunk
                # but they are guaranteed to come in the same order as the audio chunks, and we
                # don't need to build resiliency there
                # mark_message = await self._mark_message_queue.get()
                # NOTE(review): unlike waiting on _mark_message_queue, this
                # finalizes a chunk as soon as it is dequeued here, without
                # waiting for Plivo's checkpoint acknowledgement — confirm
                # this is the intended interruption semantics.
                item = await self._unprocessed_audio_chunks_queue.get()
            except asyncio.CancelledError:
                return

            self.interruptible_event = item
            audio_chunk = item.payload

            # if mark_message.chunk_id != str(audio_chunk.chunk_id):
            #     logger.error(
            #         f"Received a mark message out of order with chunk ID {mark_message.chunk_id}"
            #     )

            if item.is_interrupted():
                audio_chunk.on_interrupt()
                audio_chunk.state = ChunkState.INTERRUPTED
                continue

            audio_chunk.on_play()
            audio_chunk.state = ChunkState.PLAYED

            # Once played, the chunk can no longer be interrupted.
            self.interruptible_event.is_interruptible = False

    async def _run_loop(self):
        # Run the WebSocket writer and the chunk finalizer concurrently.
        send_plivo_messages_task = asyncio_create_task(self._send_plivo_messages())
        process_mark_messages_task = asyncio_create_task(self._process_mark_messages())
        await asyncio.gather(send_plivo_messages_task, process_mark_messages_task)

    # TODO: Plivo
    def _send_audio_chunk_and_mark(self, chunk: bytes, chunk_id: str):
        """Queue a playAudio event for ``chunk`` followed by a checkpoint event."""
        # NOTE(review): sampleRate is hard-coded to 8000 — presumably equal
        # to PLIVO_SAMPLING_RATE; consider using self.sampling_rate instead.
        media_message = {
            'event': 'playAudio',
            # 'streamSid': self.stream_sid,
            'media': {
                'payload': base64.b64encode(chunk).decode("utf-8"),
                'sampleRate': 8000,
                'contentType': 'audio/x-l16'
            }
        }
        self._plivo_events_queue.put_nowait(json.dumps(media_message))

        # The checkpoint lets Plivo echo back when playback reaches chunk_id.
        mark_message = {
            "event": "checkpoint",
            "streamId": self.stream_sid,
            "name": chunk_id
        }
        self._plivo_events_queue.put_nowait(json.dumps(mark_message))

    def _send_clear_message(self):
        # clearAudio drops any buffered-but-unplayed audio on Plivo's side.
        clear_message = {
            "event": "clearAudio",
            "streamId": self.stream_sid,
        }
        self._plivo_events_queue.put_nowait(json.dumps(clear_message))
8 changes: 8 additions & 0 deletions vocode/streaming/synthesizer/default_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
RimeSynthesizerConfig,
StreamElementsSynthesizerConfig,
SynthesizerConfig,
SmallestSynthesizerConfig


)


from vocode.streaming.synthesizer.abstract_factory import AbstractSynthesizerFactory
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
Expand All @@ -19,6 +24,7 @@
from vocode.streaming.synthesizer.play_ht_synthesizer_v2 import PlayHtSynthesizerV2
from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer
from vocode.streaming.synthesizer.stream_elements_synthesizer import StreamElementsSynthesizer
from vocode.streaming.synthesizer.smallest_synthesizer import SmallestSynthesizer


class DefaultSynthesizerFactory(AbstractSynthesizerFactory):
Expand All @@ -44,5 +50,7 @@ def create_synthesizer(
return RimeSynthesizer(synthesizer_config)
elif isinstance(synthesizer_config, StreamElementsSynthesizerConfig):
return StreamElementsSynthesizer(synthesizer_config)
elif isinstance(synthesizer_config, SmallestSynthesizerConfig):
return SmallestSynthesizer(synthesizer_config)
else:
raise Exception("Invalid synthesizer config")
87 changes: 87 additions & 0 deletions vocode/streaming/synthesizer/smallest_synthesizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import asyncio

import requests
from typing import AsyncGenerator, Optional

from vocode import getenv


from vocode.streaming.models.message import BaseMessage
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer, SynthesisResult
from vocode.streaming.models.synthesizer import SynthesizerConfig

# NOTE(review): this duplicates SmallestSynthesizerConfig defined in
# vocode.streaming.models.synthesizer (same type tag and fields). The default
# synthesizer factory imports the models version, so its isinstance() dispatch
# will not match instances of this class — consider importing the models
# version instead of redefining it here.
class SmallestSynthesizerConfig(SynthesizerConfig, type="synthesizer_smallest"):
    """Configuration for the smallest.ai Waves synthesizer (duplicate — see note)."""

    api_token: str
    model: str = "lightning"
    voice_id: str = "aravind"
    language: str = "hi"
    speed: float = 1
    remove_extra_silence: bool = False
    transliterate: bool = True
    sampling_rate: int

class SmallestSynthesizer(BaseSynthesizer[SmallestSynthesizerConfig]):
    """Synthesizer backed by the smallest.ai Waves text-to-speech API."""

    # 320 bytes = 20 ms of 8 kHz 16-bit mono PCM, the standard telephony frame.
    CHUNK_SIZE = 320

    def __init__(
        self,
        synthesizer_config: SmallestSynthesizerConfig,
        smallest_api_key: Optional[str] = None,
    ):
        """Resolve the API key (parameter or SMALLEST_API_KEY env var) and build the request URL.

        Raises:
            ValueError: if no API key is available from either source.
        """
        super().__init__(synthesizer_config)
        # HTTPS: the bearer token travels in a header, so never use plain HTTP.
        self.lightning_url = (
            f"https://waves-api.smallest.ai/api/v1/{synthesizer_config.model}/get_speech"
        )
        smallest_api_key = smallest_api_key or getenv("SMALLEST_API_KEY")
        if not smallest_api_key:
            raise ValueError(
                "Please set SMALLEST_API_KEY environment variable or pass it as a parameter"
            )
        self.synthesizer_config.api_token = smallest_api_key

    async def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        is_first_text_chunk: bool = False,
        is_sole_text_chunk: bool = False,
    ) -> SynthesisResult:
        """Synthesize ``message`` and return its audio as fixed-size chunks.

        The caller-supplied ``chunk_size`` is deliberately overridden with
        ``CHUNK_SIZE`` (20 ms telephony frames) to match the media cadence.
        """
        chunk_size = self.CHUNK_SIZE
        audio_data = await self._generate_audio(message.text)

        async def chunk_generator() -> AsyncGenerator[SynthesisResult.ChunkResult, None]:
            for offset in range(0, len(audio_data), chunk_size):
                piece = audio_data[offset : offset + chunk_size]
                is_last = offset + chunk_size >= len(audio_data)
                yield SynthesisResult.ChunkResult(piece, is_last)

        def get_message_up_to(seconds: float) -> str:
            # Simplified: no audio/text alignment is available, so always
            # report the full message text regardless of playback position.
            return message.text

        return SynthesisResult(chunk_generator(), get_message_up_to)

    async def _generate_audio(self, text: str) -> bytes:
        """POST ``text`` to the Waves API and return the raw audio bytes.

        Runs the blocking ``requests`` call in a worker thread so the event
        loop is not stalled for the duration of the HTTP request.

        Raises:
            requests.HTTPError: if the API responds with an error status.
        """
        payload = {
            "text": text,
            "voice_id": self.synthesizer_config.voice_id,
            "add_wav_header": False,
            "sample_rate": self.synthesizer_config.sampling_rate,
            "language": self.synthesizer_config.language,
            "speed": self.synthesizer_config.speed,
            "keep_ws_open": True,
            "remove_extra_silence": self.synthesizer_config.remove_extra_silence,
            "transliterate": self.synthesizer_config.transliterate,
            "get_end_of_response_token": True,
        }
        headers = {
            "Authorization": f"Bearer {self.synthesizer_config.api_token}",
            "Content-Type": "application/json",
        }

        response = await asyncio.to_thread(
            requests.post, self.lightning_url, json=payload, headers=headers
        )
        # Surface API errors instead of streaming an error body as audio.
        response.raise_for_status()
        return response.content
Loading