From a0999389c38f986abd6bd29bd4c73e16a55a3b47 Mon Sep 17 00:00:00 2001 From: ftnext Date: Tue, 3 Dec 2024 22:39:29 +0900 Subject: [PATCH 01/14] [feat] Draft Groq recognizer --- speech_recognition/recognizers/groq.py | 33 ++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 speech_recognition/recognizers/groq.py diff --git a/speech_recognition/recognizers/groq.py b/speech_recognition/recognizers/groq.py new file mode 100644 index 00000000..2240806d --- /dev/null +++ b/speech_recognition/recognizers/groq.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +import os +from io import BytesIO + +from speech_recognition.audio import AudioData +from speech_recognition.exceptions import SetupError + + +def recognize_groq( + recognizer, + audio_data: "AudioData", + *, + model: str = "whisper-large-v3-turbo", +): + if not isinstance(audio_data, AudioData): + raise ValueError("``audio_data`` must be an ``AudioData`` instance") + if os.environ.get("OPENAI_API_KEY") is None: + raise SetupError("Set environment variable ``OPENAI_API_KEY``") + + try: + import groq + except ImportError: + raise SetupError( + "missing openai module: ensure that groq is set up correctly." 
+ ) + + wav_data = BytesIO(audio_data.get_wav_data()) + wav_data.name = "SpeechRecognition_audio.wav" + + client = groq.Groq() + transcript = client.audio.transcriptions.create(file=wav_data, model=model) + return transcript.text From 4ed4268700a715abed2120f12596bafd0625476f Mon Sep 17 00:00:00 2001 From: ftnext Date: Tue, 3 Dec 2024 22:40:26 +0900 Subject: [PATCH 02/14] [feat] Add Groq to Recognizer --- speech_recognition/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 253ab0fe..238d5e50 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1506,12 +1506,13 @@ def flush(self, *args, **kwargs): # At this time, the dependencies are not yet installed, resulting in a ModuleNotFoundError. # This is a workaround to resolve this issue try: - from .recognizers import google, whisper + from .recognizers import google, groq, whisper except (ModuleNotFoundError, ImportError): pass else: Recognizer.recognize_google = google.recognize_legacy Recognizer.recognize_whisper_api = whisper.recognize_whisper_api + Recognizer.recognize_groq = groq.recognize_groq # =============================== From 0e697c3d3a0b77efc9a1240a201e0d069863bd73 Mon Sep 17 00:00:00 2001 From: ftnext Date: Tue, 3 Dec 2024 22:40:47 +0900 Subject: [PATCH 03/14] [chore] Install with groq extra --- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index dac06ca2..4f296a1e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,5 +18,7 @@ whisper-local = soundfile whisper-api = openai +groq = + groq assemblyai = requests From 4df5584c7ab552e4abaf855c1d512392983d014d Mon Sep 17 00:00:00 2001 From: ftnext Date: Wed, 4 Dec 2024 01:18:20 +0900 Subject: [PATCH 04/14] [bugfix] Correct environment variable name --- speech_recognition/recognizers/groq.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/speech_recognition/recognizers/groq.py 
b/speech_recognition/recognizers/groq.py index 2240806d..4cedf1b5 100644 --- a/speech_recognition/recognizers/groq.py +++ b/speech_recognition/recognizers/groq.py @@ -15,14 +15,14 @@ def recognize_groq( ): if not isinstance(audio_data, AudioData): raise ValueError("``audio_data`` must be an ``AudioData`` instance") - if os.environ.get("OPENAI_API_KEY") is None: - raise SetupError("Set environment variable ``OPENAI_API_KEY``") + if os.environ.get("GROQ_API_KEY") is None: + raise SetupError("Set environment variable ``GROQ_API_KEY``") try: import groq except ImportError: raise SetupError( - "missing openai module: ensure that groq is set up correctly." + "missing groq module: ensure that groq is set up correctly." ) wav_data = BytesIO(audio_data.get_wav_data()) From 7ad32b317c0c9c432351f534c6a5e5e2406ad6b8 Mon Sep 17 00:00:00 2001 From: ftnext Date: Wed, 4 Dec 2024 01:23:18 +0900 Subject: [PATCH 05/14] [refactor] Distinguish client creation from client use --- speech_recognition/recognizers/groq.py | 16 +++++--------- speech_recognition/recognizers/whisper_api.py | 22 +++++++++++++++++++ 2 files changed, 28 insertions(+), 10 deletions(-) create mode 100644 speech_recognition/recognizers/whisper_api.py diff --git a/speech_recognition/recognizers/groq.py b/speech_recognition/recognizers/groq.py index 4cedf1b5..ae76b329 100644 --- a/speech_recognition/recognizers/groq.py +++ b/speech_recognition/recognizers/groq.py @@ -1,10 +1,12 @@ from __future__ import annotations import os -from io import BytesIO from speech_recognition.audio import AudioData from speech_recognition.exceptions import SetupError +from speech_recognition.recognizers.whisper_api import ( + OpenAICompatibleRecognizer, +) def recognize_groq( @@ -12,9 +14,7 @@ def recognize_groq( audio_data: "AudioData", *, model: str = "whisper-large-v3-turbo", -): - if not isinstance(audio_data, AudioData): - raise ValueError("``audio_data`` must be an ``AudioData`` instance") +) -> str: if 
os.environ.get("GROQ_API_KEY") is None: raise SetupError("Set environment variable ``GROQ_API_KEY``") @@ -25,9 +25,5 @@ def recognize_groq( "missing groq module: ensure that groq is set up correctly." ) - wav_data = BytesIO(audio_data.get_wav_data()) - wav_data.name = "SpeechRecognition_audio.wav" - - client = groq.Groq() - transcript = client.audio.transcriptions.create(file=wav_data, model=model) - return transcript.text + recognizer = OpenAICompatibleRecognizer(groq.Groq()) + return recognizer.recognize(audio_data, model) diff --git a/speech_recognition/recognizers/whisper_api.py b/speech_recognition/recognizers/whisper_api.py new file mode 100644 index 00000000..f2b3e711 --- /dev/null +++ b/speech_recognition/recognizers/whisper_api.py @@ -0,0 +1,22 @@ +from io import BytesIO + +from speech_recognition.audio import AudioData + + +class OpenAICompatibleRecognizer: + def __init__(self, client) -> None: + self.client = client + + def recognize(self, audio_data: "AudioData", model: str) -> str: + if not isinstance(audio_data, AudioData): + raise ValueError( + "``audio_data`` must be an ``AudioData`` instance" + ) + + wav_data = BytesIO(audio_data.get_wav_data()) + wav_data.name = "SpeechRecognition_audio.wav" + + transcript = self.client.audio.transcriptions.create( + file=wav_data, model=model + ) + return transcript.text From 7157b396fdbf8eddeeaf0aad08e6f97a70255a0d Mon Sep 17 00:00:00 2001 From: ftnext Date: Wed, 4 Dec 2024 23:15:14 +0900 Subject: [PATCH 06/14] [test] Groq transcription --- setup.cfg | 1 + tests/recognizers/test_groq.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 tests/recognizers/test_groq.py diff --git a/setup.cfg b/setup.cfg index 4f296a1e..1063f5e2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -9,6 +9,7 @@ dev = rstcheck pytest pytest-randomly + respx audio = PyAudio >= 0.2.11 pocketsphinx = diff --git a/tests/recognizers/test_groq.py b/tests/recognizers/test_groq.py new file mode 100644 index 
00000000..37e19cbc --- /dev/null +++ b/tests/recognizers/test_groq.py @@ -0,0 +1,29 @@ +from unittest.mock import MagicMock + +import httpx +import respx + +from speech_recognition import AudioData, Recognizer +from speech_recognition.recognizers import groq + + +@respx.mock(assert_all_called=True, assert_all_mocked=True) +def test_transcribe_with_groq_whisper(respx_mock): + respx_mock.post("https://api.groq.com/openai/v1/audio/transcriptions").mock( + return_value=httpx.Response( + 200, + json={ + "text": "Transcription by Groq Whisper", + "x_groq": {"id": "req_unique_id"}, + }, + ) + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"audio_data" + + actual = groq.recognize_groq( + MagicMock(spec=Recognizer), audio_data, model="whisper-large-v3" + ) + + assert actual == "Transcription by Groq Whisper" From 33a78b71f05fc6525d1c6ba208bfaa74eb93b7be Mon Sep 17 00:00:00 2001 From: ftnext Date: Wed, 4 Dec 2024 23:20:00 +0900 Subject: [PATCH 07/14] [test] Developers can unset environment variable for groq test --- tests/recognizers/test_groq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/recognizers/test_groq.py b/tests/recognizers/test_groq.py index 37e19cbc..11f0426d 100644 --- a/tests/recognizers/test_groq.py +++ b/tests/recognizers/test_groq.py @@ -8,7 +8,9 @@ @respx.mock(assert_all_called=True, assert_all_mocked=True) -def test_transcribe_with_groq_whisper(respx_mock): +def test_transcribe_with_groq_whisper(respx_mock, monkeypatch): + monkeypatch.setenv("GROQ_API_KEY", "gsk_grok_api_key") + respx_mock.post("https://api.groq.com/openai/v1/audio/transcriptions").mock( return_value=httpx.Response( 200, From 5f0437ab3d682eb02da7e823e93790e6c7cd703a Mon Sep 17 00:00:00 2001 From: ftnext Date: Thu, 5 Dec 2024 01:21:06 +0900 Subject: [PATCH 08/14] [feat] Support Groq's optional parameters --- speech_recognition/recognizers/groq.py | 22 ++++++++++++++++++- speech_recognition/recognizers/whisper_api.py | 4 
++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/speech_recognition/recognizers/groq.py b/speech_recognition/recognizers/groq.py index ae76b329..084f793f 100644 --- a/speech_recognition/recognizers/groq.py +++ b/speech_recognition/recognizers/groq.py @@ -1,6 +1,8 @@ from __future__ import annotations import os +from typing import Literal, TypedDict +from typing_extensions import Unpack from speech_recognition.audio import AudioData from speech_recognition.exceptions import SetupError @@ -8,12 +10,30 @@ OpenAICompatibleRecognizer, ) +# https://console.groq.com/docs/speech-text#supported-models +GroqModel = Literal[ + "whisper-large-v3-turbo", "whisper-large-v3", "distil-whisper-large-v3-en" +] + + +class GroqOptionalParameters(TypedDict): + """Groq speech transcription's optional parameters. + + https://console.groq.com/docs/speech-text#transcription-endpoint-usage + """ + + prompt: str + response_format: str + temperature: float + language: str + def recognize_groq( recognizer, audio_data: "AudioData", *, - model: str = "whisper-large-v3-turbo", + model: GroqModel = "whisper-large-v3-turbo", + **kwargs: Unpack[GroqOptionalParameters], ) -> str: if os.environ.get("GROQ_API_KEY") is None: raise SetupError("Set environment variable ``GROQ_API_KEY``") diff --git a/speech_recognition/recognizers/whisper_api.py b/speech_recognition/recognizers/whisper_api.py index f2b3e711..c435ef59 100644 --- a/speech_recognition/recognizers/whisper_api.py +++ b/speech_recognition/recognizers/whisper_api.py @@ -7,7 +7,7 @@ class OpenAICompatibleRecognizer: def __init__(self, client) -> None: self.client = client - def recognize(self, audio_data: "AudioData", model: str) -> str: + def recognize(self, audio_data: "AudioData", model: str, **kwargs) -> str: if not isinstance(audio_data, AudioData): raise ValueError( "``audio_data`` must be an ``AudioData`` instance" @@ -17,6 +17,6 @@ def recognize(self, audio_data: "AudioData", model: str) -> str: wav_data.name = 
"SpeechRecognition_audio.wav" transcript = self.client.audio.transcriptions.create( - file=wav_data, model=model + file=wav_data, model=model, **kwargs ) return transcript.text From 53a86113f102c0f40c6e9a3ffcd05fec1bb4af2a Mon Sep 17 00:00:00 2001 From: ftnext Date: Thu, 5 Dec 2024 01:33:31 +0900 Subject: [PATCH 09/14] [docs] Users can see recognize_groq's help --- speech_recognition/recognizers/groq.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/speech_recognition/recognizers/groq.py b/speech_recognition/recognizers/groq.py index 084f793f..b36822f2 100644 --- a/speech_recognition/recognizers/groq.py +++ b/speech_recognition/recognizers/groq.py @@ -35,6 +35,15 @@ def recognize_groq( model: GroqModel = "whisper-large-v3-turbo", **kwargs: Unpack[GroqOptionalParameters], ) -> str: + """ + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Groq Whisper API. + + This function requires login to Groq; visit https://console.groq.com/login, then generate API Key in `API Keys `__ menu. + + Detail: https://console.groq.com/docs/speech-text + + Raises a ``speech_recognition.exceptions.SetupError`` exception if there are any issues with the groq installation, or the environment variable is missing. 
+ """ if os.environ.get("GROQ_API_KEY") is None: raise SetupError("Set environment variable ``GROQ_API_KEY``") From 41c64a6f3e7c1135dd323e343464ab793cae97d8 Mon Sep 17 00:00:00 2001 From: ftnext Date: Thu, 5 Dec 2024 01:35:02 +0900 Subject: [PATCH 10/14] [docs] Users can read Groq instructions --- README.rst | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 51809ddf..337f6873 100644 --- a/README.rst +++ b/README.rst @@ -39,7 +39,8 @@ Speech recognition engine/API support: * `Tensorflow `__ * `Vosk API `__ (works offline) * `OpenAI whisper `__ (works offline) -* `Whisper API `__ +* `OpenAI Whisper API `__ +* `Groq Whisper API `__ **Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details. @@ -96,7 +97,8 @@ To use all of the functionality of the library, you should have: * **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X) * **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``) * **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``) -* **openai** (required only if you need to use Whisper API speech recognition ``recognizer_instance.recognize_whisper_api``) +* **openai** (required only if you need to use OpenAI Whisper API speech recognition ``recognizer_instance.recognize_whisper_api``) +* **groq** (required only if you need to use Groq Whisper API speech recognition ``recognizer_instance.recognize_groq``) The following requirements are optional, but can improve or extend functionality in some situations: @@ -171,15 +173,24 @@ Whisper is **required if and only if you want to use whisper** (``recognizer_ins You can install it with ``python3 -m pip install SpeechRecognition[whisper-local]``. 
-Whisper API (for Whisper API users) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OpenAI Whisper API (for OpenAI Whisper API users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The library `openai `__ is **required if and only if you want to use Whisper API** (``recognizer_instance.recognize_whisper_api``). +The library `openai `__ is **required if and only if you want to use OpenAI Whisper API** (``recognizer_instance.recognize_whisper_api``). If not installed, everything in the library will still work, except calling ``recognizer_instance.recognize_whisper_api`` will raise an ``RequestError``. You can install it with ``python3 -m pip install SpeechRecognition[whisper-api]``. +Groq Whisper API (for Groq Whisper API users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The library `groq `__ is **required if and only if you want to use Groq Whisper API** (``recognizer_instance.recognize_groq``). + +If not installed, everything in the library will still work, except calling ``recognizer_instance.recognize_groq`` will raise a ``SetupError``. + +You can install it with ``python3 -m pip install SpeechRecognition[groq]``. 
+ Troubleshooting --------------- From 7611aa51eaf2d0c3a1575593ee7bae23a92f5e83 Mon Sep 17 00:00:00 2001 From: ftnext Date: Thu, 5 Dec 2024 01:35:24 +0900 Subject: [PATCH 11/14] [style] Black format --- tests/recognizers/test_groq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/recognizers/test_groq.py b/tests/recognizers/test_groq.py index 11f0426d..c821eba5 100644 --- a/tests/recognizers/test_groq.py +++ b/tests/recognizers/test_groq.py @@ -11,7 +11,9 @@ def test_transcribe_with_groq_whisper(respx_mock, monkeypatch): monkeypatch.setenv("GROQ_API_KEY", "gsk_grok_api_key") - respx_mock.post("https://api.groq.com/openai/v1/audio/transcriptions").mock( + respx_mock.post( + "https://api.groq.com/openai/v1/audio/transcriptions" + ).mock( return_value=httpx.Response( 200, json={ From b2b9afe5a0a2d664805e6c68a62d18ade475a740 Mon Sep 17 00:00:00 2001 From: ftnext Date: Thu, 5 Dec 2024 01:41:20 +0900 Subject: [PATCH 12/14] [bugfix] Install groq library in CI --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index eef992c5..fd3a67cc 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -53,7 +53,7 @@ jobs: - name: Install Python dependencies (Windows) if: matrix.os == 'windows-latest' run: | - python -m pip install .[dev,whisper-local,whisper-api] + python -m pip install .[dev,whisper-local,whisper-api,groq] - name: Test with unittest run: | pytest --doctest-modules -v speech_recognition/recognizers/ tests/ From ce1962a1f950bc19b8172944f57198b082b18b3b Mon Sep 17 00:00:00 2001 From: ftnext Date: Thu, 5 Dec 2024 08:40:59 +0900 Subject: [PATCH 13/14] [bugfix] Install Groq on all runners --- .github/workflows/unittests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index fd3a67cc..b4685c0c 100644 --- 
a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -44,12 +44,12 @@ jobs: - name: Install Python dependencies (Ubuntu, <=3.12) if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.13' run: | - python -m pip install .[dev,audio,pocketsphinx,whisper-local,whisper-api] + python -m pip install .[dev,audio,pocketsphinx,whisper-local,whisper-api,groq] - name: Install Python dependencies (Ubuntu, 3.13) if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13' run: | python -m pip install standard-aifc setuptools - python -m pip install --no-build-isolation .[dev,audio,pocketsphinx,whisper-api] + python -m pip install --no-build-isolation .[dev,audio,pocketsphinx,whisper-api,groq] - name: Install Python dependencies (Windows) if: matrix.os == 'windows-latest' run: | From 698cd761fc37fd4233d144a92c69d24566118539 Mon Sep 17 00:00:00 2001 From: ftnext Date: Thu, 5 Dec 2024 08:52:32 +0900 Subject: [PATCH 14/14] [bugfix] RESPX is compatible with HTTPX<0.28 --- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index 1063f5e2..0eb7e3d9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,7 +19,9 @@ whisper-local = soundfile whisper-api = openai + httpx < 0.28 groq = groq + httpx < 0.28 assemblyai = requests