-
Notifications
You must be signed in to change notification settings - Fork 504
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f948c36
commit 516fa92
Showing
10 changed files
with
249 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# LiveKit Plugins Clova | ||
|
||
Agent Framework plugin for [Clova](https://api.ncloud-docs.com/docs/)'s API. Currently supports speech-to-text. | ||
|
||
## Installation | ||
|
||
```bash | ||
pip install livekit-plugins-clova | ||
``` | ||
|
||
## Pre-requisites | ||
|
||
You need an invoke URL and a secret key from Naver Cloud Platform -> Clova Speech, set as the environment variables `CLOVA_STT_INVOKE_URL` and `CLOVA_STT_SECRET_KEY`. |
21 changes: 21 additions & 0 deletions
21
livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from .stt import STT | ||
from .version import __version__ | ||
|
||
__all__ = [ | ||
"STT", | ||
"__version__", | ||
] | ||
|
||
|
||
from livekit.agents import Plugin | ||
|
||
|
||
class ClovaSTTPlugin(Plugin):
    """Plugin descriptor for the Clova speech-to-text package.

    Passes this module's name, the package version and the package name to
    the ``Plugin`` base class.
    """

    def __init__(self) -> None:
        super().__init__(__name__, __version__, __package__)

    def download_files(self) -> None:
        """Do nothing; this plugin has no files to download."""


# Executed at import time: importing this package registers the plugin.
Plugin.register_plugin(ClovaSTTPlugin())
13 changes: 13 additions & 0 deletions
13
livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/common.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
import io | ||
|
||
from pydub import AudioSegment | ||
|
||
|
||
def resample_audio(audio_bytes, original_sample_rate, target_sample_rate):
    """Resample raw PCM audio and return the converted raw bytes.

    The input is interpreted as mono audio with 2-byte samples; both values
    are fixed in the call below.

    Args:
        audio_bytes: Raw PCM sample data.
        original_sample_rate: Frame rate of ``audio_bytes``.
        target_sample_rate: Frame rate of the returned data.

    Returns:
        The raw sample data of the resampled segment.
    """
    source_segment = AudioSegment.from_raw(
        io.BytesIO(audio_bytes),
        sample_width=2,
        frame_rate=original_sample_rate,
        channels=1,
    )
    converted_segment = source_segment.set_frame_rate(target_sample_rate)
    return converted_segment.raw_data
2 changes: 2 additions & 0 deletions
2
livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/constants.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Sample rate (Hz) that audio is resampled to before it is uploaded to Clova.
CLOVA_INPUT_SAMPLE_RATE = 16_000
# NOTE(review): presumably the sample rate of incoming LiveKit audio; it is
# not referenced by the code visible here - confirm before relying on it.
LIVEKIT_INPUT_SAMPLE_RATE = 48_000
3 changes: 3 additions & 0 deletions
3
livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/log.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
import logging | ||
|
||
logger = logging.getLogger("livekit.plugins.clova") |
14 changes: 14 additions & 0 deletions
14
livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/models.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
from typing import Literal | ||
|
||
# Language codes used to annotate the ``language`` option of the Clova STT
# plugin.
ClovaSttLanguages = Literal["zh", "zh-CN", "zh-TW", "ko", "ja", "en"]

# Path suffixes appended to the invoke URL to select a recognizer endpoint.
ClovaSpeechAPIType = Literal[
    "recognizer/object-storage",
    "recognizer/url",
    "recognizer/upload",
]
107 changes: 107 additions & 0 deletions
107
livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/stt.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
# Copyright 2023 LiveKit, Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import io | ||
import json | ||
import os | ||
import wave | ||
from typing import Optional, Union | ||
|
||
import aiohttp | ||
from livekit.agents import stt, utils | ||
from livekit.agents.utils import AudioBuffer, merge_frames | ||
from livekit.plugins.clova.constants import CLOVA_INPUT_SAMPLE_RATE | ||
|
||
from .common import resample_audio | ||
from .log import logger | ||
from .models import ClovaSttLanguages, ClovaSpeechAPIType | ||
|
||
|
||
class STT(stt.STT):
    """Non-streaming speech-to-text that posts audio to the Clova Speech API.

    Each ``recognize`` call merges the supplied frames, resamples them to
    ``CLOVA_INPUT_SAMPLE_RATE``, wraps them in a mono 16-bit WAV container and
    uploads the result as multipart form data.
    """

    def __init__(
        self,
        *,
        language: ClovaSttLanguages = "en-US",
        secret: Optional[str] = None,
        invoke_url: Optional[str] = None,
        http_session: Optional[aiohttp.ClientSession] = None,
        use_grpc: bool = False,
    ):
        """Create a Clova STT client.

        Args:
            language: Default language sent with every request.
            secret: API secret key; falls back to ``CLOVA_STT_SECRET_KEY``.
            invoke_url: Base invoke URL; falls back to ``CLOVA_STT_INVOKE_URL``.
            http_session: Optional session to reuse; when omitted, one is
                obtained from ``utils.http_context`` on first use.
            use_grpc: Stored on the instance; not consulted by this class.

        Raises:
            ValueError: If the secret key or the invoke URL is missing.
        """
        super().__init__(streaming_supported=False)
        self._secret = secret or os.environ.get("CLOVA_STT_SECRET_KEY")
        self._invoke_url = invoke_url or os.environ.get("CLOVA_STT_INVOKE_URL")
        self._language = language
        self._session = http_session
        self._use_grpc = use_grpc
        if not self._secret:
            raise ValueError(
                "Clova STT secret key is required. It should be set with env CLOVA_STT_SECRET_KEY"
            )
        # Fail fast: without this check requests would be sent to
        # "None/recognizer/upload".
        if not self._invoke_url:
            raise ValueError(
                "Clova STT invoke url is required. It should be set with env CLOVA_STT_INVOKE_URL"
            )

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the HTTP session, fetching one from the HTTP context if unset."""
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    def url_builder(
        self, process_method: ClovaSpeechAPIType = "recognizer/upload"
    ) -> str:
        """Return the invoke URL joined with ``process_method``."""
        # Strip a trailing slash so a configured ".../" does not yield "//".
        return f"{self._invoke_url.rstrip('/')}/{process_method}"

    async def recognize(
        self,
        *,
        buffer: AudioBuffer,
        language: Union[ClovaSttLanguages, str, None] = None,
    ) -> stt.SpeechEvent:
        """Transcribe ``buffer`` with a single synchronous Clova request.

        Args:
            buffer: Audio to transcribe.
            language: Per-call language override; the instance default is
                used when omitted.

        Returns:
            A final-transcript speech event holding the recognized text.

        Raises:
            ValueError: If the response has no text or contains an ``error`` key.
        """
        request_language = language or self._language
        url = self.url_builder()
        payload = json.dumps({"language": request_language, "completion": "sync"})

        merged = merge_frames(buffer)
        # NOTE(review): resample_audio reads the bytes as mono 16-bit PCM -
        # confirm callers never pass multi-channel audio.
        pcm_bytes = resample_audio(
            merged.data.tobytes(), merged.sample_rate, CLOVA_INPUT_SAMPLE_RATE
        )

        io_buffer = io.BytesIO()
        with wave.open(io_buffer, "wb") as wav:
            wav.setnchannels(1)
            wav.setsampwidth(2)  # 16-bit
            wav.setframerate(CLOVA_INPUT_SAMPLE_RATE)
            wav.writeframes(pcm_bytes)
        io_buffer.seek(0)

        headers = {"X-CLOVASPEECH-API-KEY": self._secret}
        form_data = aiohttp.FormData()
        form_data.add_field("params", payload)
        form_data.add_field(
            "media", io_buffer, filename="audio.wav", content_type="audio/wav"
        )

        async with self._ensure_session().post(
            url, data=form_data, headers=headers
        ) as response:
            response_data = await response.json()
            text = response_data.get("text")

            if not text or "error" in response_data:
                raise ValueError(f"Unexpected response: {response_data}")

            logger.info("final event: %s", response_data)
            return self._transcription_to_speech_event(
                text=text, language=request_language
            )

    def _transcription_to_speech_event(
        self, text: str, language: Optional[str] = None
    ) -> stt.SpeechEvent:
        """Wrap ``text`` in a final-transcript event.

        ``language`` defaults to the instance language when not given.
        """
        return stt.SpeechEvent(
            # The request uses "completion": "sync", so the text is complete.
            type=stt.SpeechEventType.FINAL_TRANSCRIPT,
            alternatives=[
                stt.SpeechData(text=text, language=language or self._language)
            ],
        )
15 changes: 15 additions & 0 deletions
15
livekit-plugins/livekit-plugins-clova/livekit/plugins/clova/version.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Copyright 2023 LiveKit, Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
__version__ = "0.0.1" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
[build-system] | ||
requires = ["setuptools>=61.0"] | ||
build-backend = "setuptools.build_meta" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# Copyright 2023 LiveKit, Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import os | ||
import pathlib | ||
|
||
import setuptools | ||
import setuptools.command.build_py | ||
|
||
here = pathlib.Path(__file__).parent.resolve()

# Read __version__ by executing version.py directly, so the package (and its
# third-party imports) does not have to be importable at build time.
about = {}
with open(os.path.join(here, "livekit", "plugins", "clova", "version.py"), "r") as f:
    exec(f.read(), about)


setuptools.setup(
    name="livekit-plugins-clova",
    version=about["__version__"],
    description="LiveKit Agents Plugin for LINE Clova STT",
    long_description=(here / "README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    url="https://github.com/livekit/agents",
    cmdclass={},
    classifiers=[
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Multimedia :: Sound/Audio",
        "Topic :: Multimedia :: Video",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3 :: Only",
    ],
    keywords=["webrtc", "realtime", "audio", "video", "livekit"],
    license="Apache-2.0",
    packages=setuptools.find_namespace_packages(include=["livekit.*"]),
    python_requires=">=3.9.0",
    install_requires=[
        "livekit-agents~=0.7",
        # common.py imports pydub at module level; without this entry a
        # fresh install fails as soon as the plugin is imported.
        "pydub~=0.25",
    ],
    project_urls={
        "Documentation": "https://docs.livekit.io",
        "Website": "https://livekit.io/",
        "Source": "https://github.com/livekit/agents",
    },
)