From 516fa92f793279a63b424f39dd265ef6c887bafe Mon Sep 17 00:00:00 2001
From: joseph <joseph@castalk.com>
Date: Tue, 16 Jul 2024 00:45:51 +0700
Subject: [PATCH] support clova speech

---
 .../livekit-plugins-clova/README.md           |  13 +++
 .../livekit/plugins/clova/__init__.py         |  21 ++++
 .../livekit/plugins/clova/common.py           |  13 +++
 .../livekit/plugins/clova/constants.py        |   2 +
 .../livekit/plugins/clova/log.py              |   3 +
 .../livekit/plugins/clova/models.py           |  14 +++
 .../livekit/plugins/clova/stt.py              | 107 ++++++++++++++++++
 .../livekit/plugins/clova/version.py          |  15 +++
 .../livekit-plugins-clova/pyproject.toml      |   3 +
 .../livekit-plugins-clova/setup.py            |  58 ++++++++++
 10 files changed, 249 insertions(+)
 create mode 100644 livekit-plugins/livekit-plugins-clova/README.md
 create mode 100644 livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/__init__.py
 create mode 100644 livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/common.py
 create mode 100644 livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/constants.py
 create mode 100644 livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/log.py
 create mode 100644 livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/models.py
 create mode 100644 livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/stt.py
 create mode 100644 livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/version.py
 create mode 100644 livekit-plugins/livekit-plugins-clova/pyproject.toml
 create mode 100644 livekit-plugins/livekit-plugins-clova/setup.py

diff --git a/livekit-plugins/livekit-plugins-clova/README.md b/livekit-plugins/livekit-plugins-clova/README.md
new file mode 100644
index 0000000000..013cb7fe47
--- /dev/null
+++ b/livekit-plugins/livekit-plugins-clova/README.md
@@ -0,0 +1,13 @@
+# LiveKit Plugins Clova
+
+Agent Framework plugin for [Clova](https://api.ncloud-docs.com/docs/)'s API. Currently supports speech-to-text.
+
+## Installation
+
+```bash
+pip install livekit-plugins-clova
+```
+
+## Pre-requisites
+
+You need an invoke URL and a secret key from Naver Cloud Platform -> Clova Speech; set them as the environment variables `CLOVA_STT_INVOKE_URL` and `CLOVA_STT_SECRET_KEY`.
diff --git a/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/__init__.py b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/__init__.py
new file mode 100644
index 0000000000..d554599f04
--- /dev/null
+++ b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/__init__.py
@@ -0,0 +1,21 @@
+from .stt import STT
+from .version import __version__
+
+__all__ = [
+    "STT",
+    "__version__",
+]
+
+
+from livekit.agents import Plugin
+
+
+class ClovaSTTPlugin(Plugin):  # plugin descriptor for the Clova package
+    def __init__(self) -> None:
+        super().__init__(__name__, __version__, __package__)  # module name, version, package
+
+    def download_files(self) -> None:
+        pass  # intentionally empty: this implementation downloads nothing
+
+
+Plugin.register_plugin(ClovaSTTPlugin())  # executed when this package is imported
diff --git a/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/common.py b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/common.py
new file mode 100644
index 0000000000..3418dd8bf2
--- /dev/null
+++ b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/common.py
@@ -0,0 +1,13 @@
+import io
+
+from pydub import AudioSegment
+
+
+def resample_audio(audio_bytes, original_sample_rate, target_sample_rate):
+    """Resample raw PCM bytes (read as 16-bit mono) to ``target_sample_rate``."""
+    # sample_width=2 / channels=1: the input is interpreted as 16-bit mono PCM.
+    pcm_format = {"sample_width": 2, "frame_rate": original_sample_rate, "channels": 1}
+    source_segment = AudioSegment.from_raw(io.BytesIO(audio_bytes), **pcm_format)
+    # The segment produced by set_frame_rate carries the audio at the target rate.
+    converted_segment = source_segment.set_frame_rate(target_sample_rate)
+    return converted_segment.raw_data
diff --git a/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/constants.py b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/constants.py
new file mode 100644
index 0000000000..ec109084f6
--- /dev/null
+++ b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/constants.py
@@ -0,0 +1,2 @@
+CLOVA_INPUT_SAMPLE_RATE = 16000  # Hz; stt.py resamples audio to this rate before upload
+LIVEKIT_INPUT_SAMPLE_RATE = 48000  # Hz; not referenced elsewhere in this patch — TODO confirm use
diff --git a/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/log.py b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/log.py
new file mode 100644
index 0000000000..e28e00f47f
--- /dev/null
+++ b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/log.py
@@ -0,0 +1,3 @@
+import logging
+
+logger = logging.getLogger("livekit.plugins.clova")
diff --git a/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/models.py b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/models.py
new file mode 100644
index 0000000000..2dad6f980b
--- /dev/null
+++ b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/models.py
@@ -0,0 +1,14 @@
+from typing import Literal
+
+ClovaSttLanguages = Literal[  # NOTE(review): STT defaults to "en-US", which is not listed — confirm
+    "zh",
+    "zh-CN",
+    "zh-TW",
+    "ko",
+    "ja",
+    "en",
+]
+
+ClovaSpeechAPIType = Literal[  # path suffixes that STT.url_builder appends to the invoke URL
+    "recognizer/object-storage", "recognizer/url", "recognizer/upload"
+]
diff --git a/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/stt.py b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/stt.py
new file mode 100644
index 0000000000..eea28224ee
--- /dev/null
+++ b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/stt.py
@@ -0,0 +1,107 @@
+# Copyright 2023 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import io
+import json
+import os
+import wave
+from typing import Optional, Union
+
+import aiohttp
+from livekit.agents import stt, utils
+from livekit.agents.utils import AudioBuffer, merge_frames
+from livekit.plugins.clova.constants import CLOVA_INPUT_SAMPLE_RATE
+
+from .common import resample_audio
+from .log import logger
+from .models import ClovaSttLanguages, ClovaSpeechAPIType
+
+
+class STT(stt.STT):
+    """Non-streaming speech-to-text backed by the Clova Speech HTTP API."""
+
+    def __init__(
+        self,
+        *,
+        language: Union[ClovaSttLanguages, str] = "en-US",
+        secret: Optional[str] = None,
+        invoke_url: Optional[str] = None,
+        http_session: Optional[aiohttp.ClientSession] = None,
+        use_grpc: bool = False,
+    ):
+        """Resolve credentials from arguments or env vars; raise ValueError if missing."""
+        super().__init__(streaming_supported=False)
+        self._secret = secret or os.environ.get("CLOVA_STT_SECRET_KEY")
+        self._invoke_url = invoke_url or os.environ.get("CLOVA_STT_INVOKE_URL")
+        self._language = language
+        self._session = http_session
+        self._use_grpc = use_grpc  # stored only; nothing in this class reads it
+        if not self._secret:
+            raise ValueError(
+                "Clova STT secret key is required. It should be set with env CLOVA_STT_SECRET_KEY"
+            )
+        if not self._invoke_url:  # otherwise requests would target "None/recognizer/upload"
+            raise ValueError("Clova STT invoke url is required (env CLOVA_STT_INVOKE_URL)")
+
+    def _ensure_session(self) -> aiohttp.ClientSession:
+        """Return the injected HTTP session, creating one via utils.http_context if unset."""
+        if not self._session:
+            self._session = utils.http_context.http_session()
+        return self._session
+
+    def url_builder(self, process_method: ClovaSpeechAPIType = "recognizer/upload") -> str:
+        """Return the endpoint URL for ``process_method`` under the invoke URL."""
+        return f"{self._invoke_url.rstrip('/')}/{process_method}"
+
+    async def recognize(
+        self,
+        *,
+        buffer: AudioBuffer,
+        language: Union[ClovaSttLanguages, str, None] = None,
+    ) -> stt.SpeechEvent:
+        """Transcribe ``buffer``; raise ValueError if the reply has no text or an "error" key."""
+        language = language or self._language  # per-call override; it used to be ignored
+        payload = json.dumps({"language": language, "completion": "sync"})
+
+        frame = merge_frames(buffer)
+        pcm = resample_audio(frame.data.tobytes(), frame.sample_rate, CLOVA_INPUT_SAMPLE_RATE)
+        io_buffer = io.BytesIO()
+        with wave.open(io_buffer, "wb") as wav:
+            wav.setnchannels(1)
+            wav.setsampwidth(2)  # 16-bit
+            wav.setframerate(CLOVA_INPUT_SAMPLE_RATE)
+            wav.writeframes(pcm)
+        io_buffer.seek(0)  # rewind so the upload reads the WAV from its start
+
+        form_data = aiohttp.FormData()
+        form_data.add_field("params", payload)
+        form_data.add_field("media", io_buffer, filename="audio.wav", content_type="audio/wav")
+        async with self._ensure_session().post(
+            self.url_builder(), data=form_data, headers={"X-CLOVASPEECH-API-KEY": self._secret}
+        ) as response:
+            response_data = await response.json()
+        text = response_data.get("text")
+        if not text or "error" in response_data:
+            raise ValueError(f"Unexpected response: {response_data}")
+        logger.info(f"final event: {response_data}")
+        return self._transcription_to_speech_event(text=text, language=language)
+
+    def _transcription_to_speech_event(
+        self, text: str, language: Optional[str] = None
+    ) -> stt.SpeechEvent:
+        """Wrap ``text`` in a final-transcript event tagged with the language used."""
+        return stt.SpeechEvent(
+            type=stt.SpeechEventType.FINAL_TRANSCRIPT,
+            alternatives=[stt.SpeechData(text=text, language=language or self._language)],
+        )
diff --git a/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/version.py b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/version.py
new file mode 100644
index 0000000000..b57ea55083
--- /dev/null
+++ b/livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/version.py
@@ -0,0 +1,15 @@
+# Copyright 2023 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__version__ = "0.0.1"
diff --git a/livekit-plugins/livekit-plugins-clova/pyproject.toml b/livekit-plugins/livekit-plugins-clova/pyproject.toml
new file mode 100644
index 0000000000..8cf32563a5
--- /dev/null
+++ b/livekit-plugins/livekit-plugins-clova/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
\ No newline at end of file
diff --git a/livekit-plugins/livekit-plugins-clova/setup.py b/livekit-plugins/livekit-plugins-clova/setup.py
new file mode 100644
index 0000000000..8a2a1687e2
--- /dev/null
+++ b/livekit-plugins/livekit-plugins-clova/setup.py
@@ -0,0 +1,58 @@
+# Copyright 2023 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import pathlib
+
+import setuptools
+import setuptools.command.build_py
+
+here = pathlib.Path(__file__).parent.resolve()
+about = {}  # filled by exec below; only "__version__" is read from it
+with open(os.path.join(here, "livekit", "plugins", "clova", "version.py"), "r") as f:
+    exec(f.read(), about)  # runs the package's own version.py, not external input
+
+
+setuptools.setup(
+    name="livekit-plugins-clova",
+    version=about["__version__"],
+    description="LiveKit Agents Plugin for LINE Clova STT",
+    long_description=(here / "README.md").read_text(encoding="utf-8"),
+    long_description_content_type="text/markdown",
+    url="https://github.com/livekit/agents",
+    cmdclass={},
+    classifiers=[
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: Apache Software License",
+        "Topic :: Multimedia :: Sound/Audio",
+        "Topic :: Multimedia :: Video",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3 :: Only",
+    ],
+    keywords=["webrtc", "realtime", "audio", "video", "livekit"],
+    license="Apache-2.0",
+    packages=setuptools.find_namespace_packages(include=["livekit.*"]),
+    python_requires=">=3.9.0",
+    # pydub is imported by clova/common.py (resample_audio), so it must be
+    # declared here or importing the plugin fails on a clean install.
+    install_requires=["livekit-agents~=0.7", "pydub~=0.25.1"],
+    project_urls={
+        "Documentation": "https://docs.livekit.io",
+        "Website": "https://livekit.io/",
+        "Source": "https://github.com/livekit/agents",
+    },
+)