Skip to content

Commit

Permalink
support clova speech
Browse files Browse the repository at this point in the history
  • Loading branch information
josephkieu committed Jul 15, 2024
1 parent f948c36 commit 516fa92
Show file tree
Hide file tree
Showing 10 changed files with 249 additions and 0 deletions.
13 changes: 13 additions & 0 deletions livekit-plugins/livekit-plugins-clova/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# LiveKit Plugins Clova

Agent Framework plugin for speech-to-text with [Clova](https://api.ncloud-docs.com/docs/)'s API. Currently supports speech-to-text.

## Installation

```bash
pip install livekit-plugins-clova
```

## Pre-requisites

You need an invoke URL and a secret key from Naver Cloud Platform -> Clova Speech. Set them as the environment variables `CLOVA_STT_INVOKE_URL` and `CLOVA_STT_SECRET_KEY`.
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from .stt import STT
from .version import __version__

# Names exported by `from <this package> import *`: the STT class and the
# package version string imported above.
__all__ = [
"STT",
"__version__",
]


from livekit.agents import Plugin


class ClovaSTTPlugin(Plugin):
    """Plugin descriptor for this package, built on ``livekit.agents.Plugin``."""

    def __init__(self):
        """Initialise the base ``Plugin`` with this module's name, version and package."""
        title, version, package = __name__, __version__, __package__
        super().__init__(title, version, package)

    def download_files(self):
        """Do nothing; this implementation downloads no files."""
        return None


# Executed at import time: build one ClovaSTTPlugin instance and pass it to
# Plugin.register_plugin.
Plugin.register_plugin(ClovaSTTPlugin())
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import io

from pydub import AudioSegment


def resample_audio(
    audio_bytes: bytes,
    original_sample_rate: int,
    target_sample_rate: int,
    *,
    sample_width: int = 2,
    channels: int = 1,
) -> bytes:
    """Resample headerless PCM audio to a different sample rate.

    Args:
        audio_bytes: Raw (headerless) PCM samples.
        original_sample_rate: Sample rate of ``audio_bytes``.
        target_sample_rate: Sample rate of the returned audio.
        sample_width: Bytes per sample of ``audio_bytes``. Defaults to 2
            (16-bit), which is what this function previously hard-coded.
        channels: Number of interleaved channels in ``audio_bytes``.
            Defaults to 1 (mono), which is what this function previously
            hard-coded.

    Returns:
        The resampled audio as raw PCM bytes.
    """
    segment = AudioSegment.from_raw(
        io.BytesIO(audio_bytes),
        sample_width=sample_width,
        frame_rate=original_sample_rate,
        channels=channels,
    )
    return segment.set_frame_rate(target_sample_rate).raw_data
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Sample rate the STT client resamples audio to, and writes into the WAV
# header, before uploading to Clova.
CLOVA_INPUT_SAMPLE_RATE = 16_000

# NOTE(review): presumably the sample rate of audio frames coming from
# LiveKit; confirm, and check whether anything still uses this constant.
LIVEKIT_INPUT_SAMPLE_RATE = 48_000
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import logging

# Shared logger for the Clova plugin, registered under the fixed name
# "livekit.plugins.clova".
logger = logging.getLogger("livekit.plugins.clova")
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from typing import Literal

# Language codes accepted for the `language` argument of the Clova STT client.
# "en-US" is included because it is the default value of `STT.__init__`'s
# `language` parameter; without it that default does not satisfy this type.
ClovaSttLanguages = Literal[
    "zh",
    "zh-CN",
    "zh-TW",
    "ko",
    "ja",
    "en",
    "en-US",
]

# Endpoint path suffixes that the STT client appends to the invoke URL.
ClovaSpeechAPIType = Literal[
    "recognizer/object-storage",
    "recognizer/url",
    "recognizer/upload",
]
107 changes: 107 additions & 0 deletions livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/stt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Copyright 2023 LiveKit, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import json
import os
import wave
from typing import Optional, Union

import aiohttp
from livekit.agents import stt, utils
from livekit.agents.utils import AudioBuffer, merge_frames
from livekit.plugins.clova.constants import CLOVA_INPUT_SAMPLE_RATE

from .common import resample_audio
from .log import logger
from .models import ClovaSttLanguages, ClovaSpeechAPIType


class STT(stt.STT):
    """Non-streaming speech-to-text client for the Clova Speech HTTP API.

    ``recognize`` merges the supplied audio frames, resamples them to
    ``CLOVA_INPUT_SAMPLE_RATE``, wraps the PCM data in a WAV container and
    posts it as multipart form data to ``<invoke_url>/recognizer/upload``.
    """

    def __init__(
        self,
        *,
        language: ClovaSttLanguages = "en-US",
        secret: Optional[str] = None,
        invoke_url: Optional[str] = None,
        http_session: Optional[aiohttp.ClientSession] = None,
        use_grpc: bool = False,
    ):
        """Configure the client.

        Args:
            language: Default language code, sent verbatim in the request's
                ``language`` field.
            secret: Clova Speech secret key. Falls back to the
                ``CLOVA_STT_SECRET_KEY`` environment variable.
            invoke_url: Base invoke URL. Falls back to the
                ``CLOVA_STT_INVOKE_URL`` environment variable.
            http_session: aiohttp session to reuse. When omitted, one is
                obtained from ``utils.http_context`` on first use.
            use_grpc: Stored on the instance; no method of this class
                currently reads it.

        Raises:
            ValueError: If no secret key or no invoke URL is available.
        """
        super().__init__(streaming_supported=False)
        self._secret = secret or os.environ.get("CLOVA_STT_SECRET_KEY")
        self._invoke_url = invoke_url or os.environ.get("CLOVA_STT_INVOKE_URL")
        self._language = language
        self._session = http_session
        self._use_grpc = use_grpc
        if not self._secret:
            raise ValueError(
                "Clova STT secret key is required. It should be set with env CLOVA_STT_SECRET_KEY"
            )
        # Fail fast: without this check the request URL would be built from
        # the string "None" and only fail once a request is sent.
        if not self._invoke_url:
            raise ValueError(
                "Clova STT invoke URL is required. It should be set with env CLOVA_STT_INVOKE_URL"
            )

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the HTTP session, fetching one from ``utils.http_context`` if unset."""
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    def url_builder(
        self, process_method: ClovaSpeechAPIType = "recognizer/upload"
    ) -> str:
        """Return the endpoint URL for ``process_method``.

        A trailing slash on the configured invoke URL is stripped so the
        result never contains ``//`` before the method path.
        """
        base_url = self._invoke_url.rstrip("/")
        return f"{base_url}/{process_method}"

    async def recognize(
        self,
        *,
        buffer: AudioBuffer,
        language: Union[ClovaSttLanguages, str, None] = None,
    ) -> stt.SpeechEvent:
        """Transcribe ``buffer`` with a single synchronous upload request.

        Args:
            buffer: Audio frame(s) to transcribe; merged with ``merge_frames``.
            language: Per-call language override. When omitted, the language
                given to the constructor is used.

        Returns:
            A ``stt.SpeechEvent`` with one alternative holding the text.

        Raises:
            ValueError: If the response has no non-empty ``text`` field or
                contains an ``error`` key.
        """
        # Honour the per-call override; previously this argument was ignored.
        request_language = language or self._language
        url = self.url_builder()
        payload = json.dumps({"language": request_language, "completion": "sync"})

        merged = merge_frames(buffer)
        # NOTE(review): the resampler is called without channel information and
        # the WAV header below declares mono 16-bit audio; this assumes the
        # merged frame is mono 16-bit PCM. Confirm against callers.
        pcm_bytes = resample_audio(
            merged.data.tobytes(), merged.sample_rate, CLOVA_INPUT_SAMPLE_RATE
        )

        io_buffer = io.BytesIO()
        with wave.open(io_buffer, "wb") as wav:
            wav.setnchannels(1)
            wav.setsampwidth(2)  # 16-bit
            wav.setframerate(CLOVA_INPUT_SAMPLE_RATE)
            wav.writeframes(pcm_bytes)
        # Rewind so the upload reads the WAV data from its first byte.
        io_buffer.seek(0)

        headers = {"X-CLOVASPEECH-API-KEY": self._secret}
        form_data = aiohttp.FormData()
        form_data.add_field("params", payload)
        form_data.add_field(
            "media", io_buffer, filename="audio.wav", content_type="audio/wav"
        )

        async with self._ensure_session().post(
            url, data=form_data, headers=headers
        ) as response:
            response_data = await response.json()

        text = response_data.get("text")
        if not text or "error" in response_data:
            raise ValueError(f"Unexpected response: {response_data}")

        logger.info("final event: %s", response_data)
        return self._transcription_to_speech_event(
            text=text, language=request_language
        )

    def _transcription_to_speech_event(
        self, text: str, language: Optional[str] = None
    ) -> stt.SpeechEvent:
        """Wrap ``text`` in a ``stt.SpeechEvent``.

        Args:
            text: Transcribed text.
            language: Language to report on the event; defaults to the
                language given to the constructor.
        """
        # NOTE(review): this is the complete result of a one-shot request, yet
        # it is tagged INTERIM_TRANSCRIPT. Kept as-is for compatibility;
        # confirm whether FINAL_TRANSCRIPT is intended.
        return stt.SpeechEvent(
            type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
            alternatives=[
                stt.SpeechData(text=text, language=language or self._language)
            ],
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright 2023 LiveKit, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.0.1"
3 changes: 3 additions & 0 deletions livekit-plugins/livekit-plugins-clova/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
58 changes: 58 additions & 0 deletions livekit-plugins/livekit-plugins-clova/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Copyright 2023 LiveKit, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import pathlib

import setuptools
import setuptools.command.build_py

# Directory containing this setup script.
here = pathlib.Path(__file__).parent.resolve()

# Execute version.py into a scratch namespace so `__version__` can be read
# without importing the package.
about = {}
version_path = here / "livekit" / "plugins" / "clova" / "version.py"
with open(version_path, "r") as f:
    version_source = f.read()
exec(version_source, about)


# Package metadata. `version` comes from version.py (loaded into `about`
# above) and the long description is the README next to this file.
setuptools.setup(
    name="livekit-plugins-clova",
    version=about["__version__"],
    description="LiveKit Agents Plugin for LINE Clova STT",
    long_description=(here / "README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    url="https://github.com/livekit/agents",
    cmdclass={},
    classifiers=[
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Multimedia :: Sound/Audio",
        "Topic :: Multimedia :: Video",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3 :: Only",
    ],
    keywords=["webrtc", "realtime", "audio", "video", "livekit"],
    license="Apache-2.0",
    packages=setuptools.find_namespace_packages(include=["livekit.*"]),
    python_requires=">=3.9.0",
    install_requires=[
        "livekit-agents~=0.7",
        # The package's audio resampling helper imports pydub, so it must be
        # declared here or a fresh install fails on import.
        "pydub~=0.25",
    ],
    project_urls={
        "Documentation": "https://docs.livekit.io",
        "Website": "https://livekit.io/",
        "Source": "https://github.com/livekit/agents",
    },
)

0 comments on commit 516fa92

Please sign in to comment.