From 516fa92f793279a63b424f39dd265ef6c887bafe Mon Sep 17 00:00:00 2001 From: joseph Date: Tue, 16 Jul 2024 00:45:51 +0700 Subject: [PATCH] support clova speech --- .../livekit-plugins-clova/README.md | 13 +++ .../livekit/plugins/clova/__init__.py | 21 ++++ .../livekit/plugins/clova/common.py | 13 +++ .../livekit/plugins/clova/constants.py | 2 + .../livekit/plugins/clova/log.py | 3 + .../livekit/plugins/clova/models.py | 14 +++ .../livekit/plugins/clova/stt.py | 107 ++++++++++++++++++ .../livekit/plugins/clova/version.py | 15 +++ .../livekit-plugins-clova/pyproject.toml | 3 + .../livekit-plugins-clova/setup.py | 58 ++++++++++ 10 files changed, 249 insertions(+) create mode 100644 livekit-plugins/livekit-plugins-clova/README.md create mode 100644 livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/__init__.py create mode 100644 livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/common.py create mode 100644 livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/constants.py create mode 100644 livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/log.py create mode 100644 livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/models.py create mode 100644 livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/stt.py create mode 100644 livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/version.py create mode 100644 livekit-plugins/livekit-plugins-clova/pyproject.toml create mode 100644 livekit-plugins/livekit-plugins-clova/setup.py diff --git a/livekit-plugins/livekit-plugins-clova/README.md b/livekit-plugins/livekit-plugins-clova/README.md new file mode 100644 index 0000000000..013cb7fe47 --- /dev/null +++ b/livekit-plugins/livekit-plugins-clova/README.md @@ -0,0 +1,13 @@ +# LiveKit Plugins Clova + +Agent Framework plugin for speech-to-text with [Clova](https://api.ncloud-docs.com/docs/)'s API. Currently supports speech-to-text. 
# --- livekit/plugins/clova/__init__.py ---
from .stt import STT
from .version import __version__

__all__ = [
    "STT",
    "__version__",
]


from livekit.agents import Plugin


class ClovaSTTPlugin(Plugin):
    """Plugin descriptor that registers the Clova package with livekit.agents."""

    def __init__(self) -> None:
        # Title, version and package are taken from this module's own metadata.
        super().__init__(__name__, __version__, __package__)

    def download_files(self) -> None:
        """Do nothing; this plugin has no files to download."""
        pass


# Registration happens as an import side effect of the package.
Plugin.register_plugin(ClovaSTTPlugin())


# --- livekit/plugins/clova/common.py ---
import io

from pydub import AudioSegment


def resample_audio(
    audio_bytes: bytes,
    original_sample_rate: int,
    target_sample_rate: int,
    *,
    sample_width: int = 2,
    channels: int = 1,
) -> bytes:
    """Resample raw PCM audio to a different frame rate.

    Args:
        audio_bytes: Headerless PCM samples.
        original_sample_rate: Frame rate of ``audio_bytes`` in Hz.
        target_sample_rate: Frame rate of the returned audio in Hz.
        sample_width: Bytes per sample. Defaults to 2 (16-bit), which was the
            only layout supported before this parameter existed.
        channels: Number of interleaved channels. Defaults to 1 (mono), the
            previously hard-coded value.

    Returns:
        The resampled audio as raw PCM bytes with the same sample width and
        channel count as the input.
    """
    segment = AudioSegment.from_raw(
        io.BytesIO(audio_bytes),
        sample_width=sample_width,
        frame_rate=original_sample_rate,
        channels=channels,
    )
    return segment.set_frame_rate(target_sample_rate).raw_data


# --- livekit/plugins/clova/constants.py ---
# Sample rate (Hz) that audio is resampled to before upload to Clova.
CLOVA_INPUT_SAMPLE_RATE = 16000
# NOTE(review): presumably the default LiveKit capture rate; not referenced by
# the code visible in this patch -- confirm before relying on it.
LIVEKIT_INPUT_SAMPLE_RATE = 48000
a/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/log.py b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/log.py new file mode 100644 index 0000000000..e28e00f47f --- /dev/null +++ b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/log.py @@ -0,0 +1,3 @@ +import logging + +logger = logging.getLogger("livekit.plugins.clova") diff --git a/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/models.py b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/models.py new file mode 100644 index 0000000000..2dad6f980b --- /dev/null +++ b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/models.py @@ -0,0 +1,14 @@ +from typing import Literal + +ClovaSttLanguages = Literal[ + "zh", + "zh-CN", + "zh-TW", + "ko", + "ja", + "en", +] + +ClovaSpeechAPIType = Literal[ + "recognizer/object-storage", "recognizer/url", "recognizer/upload" +] diff --git a/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/stt.py b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/stt.py new file mode 100644 index 0000000000..eea28224ee --- /dev/null +++ b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/stt.py @@ -0,0 +1,107 @@ +# Copyright 2023 LiveKit, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import io
import json
import os
import wave
from typing import Optional, Union

import aiohttp
from livekit.agents import stt, utils
from livekit.agents.utils import AudioBuffer, merge_frames
from livekit.plugins.clova.constants import CLOVA_INPUT_SAMPLE_RATE

from .common import resample_audio
from .log import logger
from .models import ClovaSttLanguages, ClovaSpeechAPIType


class STT(stt.STT):
    """Non-streaming speech-to-text that uploads audio to a Clova Speech endpoint.

    Each call to :meth:`recognize` merges the supplied frames, resamples them
    to ``CLOVA_INPUT_SAMPLE_RATE``, wraps them in a WAV container and posts
    them as multipart form data to ``<invoke_url>/recognizer/upload``.
    """

    def __init__(
        self,
        *,
        # NOTE(review): the default "en-US" is not one of the ClovaSttLanguages
        # literals, hence the widened annotation -- confirm which codes the
        # service actually accepts.
        language: Union[ClovaSttLanguages, str] = "en-US",
        secret: Optional[str] = None,
        invoke_url: Optional[str] = None,
        http_session: Optional[aiohttp.ClientSession] = None,
        use_grpc: bool = False,
    ):
        """Create the client.

        Args:
            language: Default language sent with every request.
            secret: API key; falls back to env ``CLOVA_STT_SECRET_KEY``.
            invoke_url: Base URL of the service; falls back to env
                ``CLOVA_STT_INVOKE_URL``.
            http_session: Optional session to reuse for requests.
            use_grpc: Stored on the instance; not read by any code in this
                class.

        Raises:
            ValueError: If no secret key is supplied or found in the
                environment.
        """
        super().__init__(streaming_supported=False)
        self._secret = secret or os.environ.get("CLOVA_STT_SECRET_KEY")
        self._invoke_url = invoke_url or os.environ.get("CLOVA_STT_INVOKE_URL")
        self._language = language
        self._session = http_session
        self._use_grpc = use_grpc
        if self._secret is None:
            raise ValueError(
                "Clova STT secret key is required. It should be set with env CLOVA_STT_SECRET_KEY"
            )

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the injected session, or lazily obtain one from ``utils.http_context``."""
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    def url_builder(
        self, process_method: ClovaSpeechAPIType = "recognizer/upload"
    ) -> str:
        """Build the request URL for ``process_method``.

        Raises:
            ValueError: If no invoke URL was configured. Previously this
                produced the unusable string ``"None/<process_method>"``.
        """
        if not self._invoke_url:
            raise ValueError(
                "Clova STT invoke url is required. It should be set with env CLOVA_STT_INVOKE_URL"
            )
        # Tolerate a trailing slash on the configured base URL.
        return f"{self._invoke_url.rstrip('/')}/{process_method}"

    async def recognize(
        self,
        *,
        buffer: AudioBuffer,
        language: Union[ClovaSttLanguages, str, None] = None,
    ) -> stt.SpeechEvent:
        """Transcribe ``buffer`` with a single synchronous upload request.

        Args:
            buffer: Audio frame(s) to transcribe.
            language: Per-call language override; the instance default is used
                when omitted.

        Returns:
            A final-transcript event holding the recognised text.

        Raises:
            ValueError: If the invoke URL is missing, or the response contains
                an ``error`` key or no text.
        """
        # Honour the per-call override; the original always sent self._language.
        target_language = language or self._language
        url = self.url_builder()
        payload = json.dumps({"language": target_language, "completion": "sync"})

        # NOTE: resample_audio and the WAV header below both describe 16-bit
        # mono PCM; multi-channel input is not handled here.
        frame = merge_frames(buffer)
        pcm_bytes = resample_audio(
            frame.data.tobytes(), frame.sample_rate, CLOVA_INPUT_SAMPLE_RATE
        )

        io_buffer = io.BytesIO()
        with wave.open(io_buffer, "wb") as wav:
            wav.setnchannels(1)
            wav.setsampwidth(2)  # 16-bit
            wav.setframerate(CLOVA_INPUT_SAMPLE_RATE)
            wav.writeframes(pcm_bytes)
        io_buffer.seek(0)  # rewind so the upload starts at the WAV header

        headers = {"X-CLOVASPEECH-API-KEY": self._secret}
        form_data = aiohttp.FormData()
        form_data.add_field("params", payload)
        form_data.add_field(
            "media", io_buffer, filename="audio.wav", content_type="audio/wav"
        )

        async with self._ensure_session().post(
            url, data=form_data, headers=headers
        ) as response:
            response_data = await response.json()

        text = response_data.get("text")
        if not text or "error" in response_data:
            raise ValueError(f"Unexpected response: {response_data}")

        logger.info("final event: %s", response_data)
        return self._transcription_to_speech_event(
            text=text, language=target_language
        )

    def _transcription_to_speech_event(
        self, text: str, language: Optional[str] = None
    ) -> stt.SpeechEvent:
        """Wrap ``text`` in a final-transcript event.

        ``recognize`` is a one-shot request, so its result is final rather
        than interim. ``language`` defaults to the instance language.
        """
        return stt.SpeechEvent(
            type=stt.SpeechEventType.FINAL_TRANSCRIPT,
            alternatives=[
                stt.SpeechData(text=text, language=language or self._language)
            ],
        )
+ +__version__ = "0.0.1" diff --git a/livekit-plugins/livekit-plugins-clova/pyproject.toml b/livekit-plugins/livekit-plugins-clova/pyproject.toml new file mode 100644 index 0000000000..8cf32563a5 --- /dev/null +++ b/livekit-plugins/livekit-plugins-clova/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/livekit-plugins/livekit-plugins-clova/setup.py b/livekit-plugins/livekit-plugins-clova/setup.py new file mode 100644 index 0000000000..8a2a1687e2 --- /dev/null +++ b/livekit-plugins/livekit-plugins-clova/setup.py @@ -0,0 +1,58 @@ +# Copyright 2023 LiveKit, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import os
import pathlib

import setuptools
import setuptools.command.build_py

here = pathlib.Path(__file__).parent.resolve()

# Load __version__ by executing version.py instead of importing the package,
# which would pull in runtime dependencies that may not be installed yet.
about = {}
with open(
    os.path.join(here, "livekit", "plugins", "clova", "version.py"),
    "r",
    encoding="utf-8",
) as f:
    exec(f.read(), about)


setuptools.setup(
    name="livekit-plugins-clova",
    version=about["__version__"],
    description="LiveKit Agents Plugin for LINE Clova STT",
    long_description=(here / "README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    url="https://github.com/livekit/agents",
    cmdclass={},
    classifiers=[
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Multimedia :: Sound/Audio",
        "Topic :: Multimedia :: Video",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3 :: Only",
    ],
    keywords=["webrtc", "realtime", "audio", "video", "livekit"],
    license="Apache-2.0",
    packages=setuptools.find_namespace_packages(include=["livekit.*"]),
    python_requires=">=3.9.0",
    install_requires=[
        "livekit-agents~=0.7",
        # Imported directly by the plugin (stt.py and common.py respectively),
        # so they are declared rather than relied on transitively.
        "aiohttp",
        "pydub~=0.25",
    ],
    project_urls={
        "Documentation": "https://docs.livekit.io",
        "Website": "https://livekit.io/",
        "Source": "https://github.com/livekit/agents",
    },
)