diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index eef992c5..b4685c0c 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -44,16 +44,16 @@ jobs: - name: Install Python dependencies (Ubuntu, <=3.12) if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.13' run: | - python -m pip install .[dev,audio,pocketsphinx,whisper-local,whisper-api] + python -m pip install .[dev,audio,pocketsphinx,whisper-local,whisper-api,groq] - name: Install Python dependencies (Ubuntu, 3.13) if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13' run: | python -m pip install standard-aifc setuptools - python -m pip install --no-build-isolation .[dev,audio,pocketsphinx,whisper-api] + python -m pip install --no-build-isolation .[dev,audio,pocketsphinx,whisper-api,groq] - name: Install Python dependencies (Windows) if: matrix.os == 'windows-latest' run: | - python -m pip install .[dev,whisper-local,whisper-api] + python -m pip install .[dev,whisper-local,whisper-api,groq] - name: Test with unittest run: | pytest --doctest-modules -v speech_recognition/recognizers/ tests/ diff --git a/README.rst b/README.rst index 51809ddf..337f6873 100644 --- a/README.rst +++ b/README.rst @@ -39,7 +39,8 @@ Speech recognition engine/API support: * `Tensorflow `__ * `Vosk API `__ (works offline) * `OpenAI whisper `__ (works offline) -* `Whisper API `__ +* `OpenAI Whisper API `__ +* `Groq Whisper API `__ **Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details. 
@@ -96,7 +97,8 @@ To use all of the functionality of the library, you should have: * **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X) * **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``) * **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``) -* **openai** (required only if you need to use Whisper API speech recognition ``recognizer_instance.recognize_whisper_api``) +* **openai** (required only if you need to use OpenAI Whisper API speech recognition ``recognizer_instance.recognize_whisper_api``) +* **groq** (required only if you need to use Groq Whisper API speech recognition ``recognizer_instance.recognize_groq``) The following requirements are optional, but can improve or extend functionality in some situations: @@ -171,15 +173,24 @@ Whisper is **required if and only if you want to use whisper** (``recognizer_ins You can install it with ``python3 -m pip install SpeechRecognition[whisper-local]``. -Whisper API (for Whisper API users) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OpenAI Whisper API (for OpenAI Whisper API users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The library `openai `__ is **required if and only if you want to use Whisper API** (``recognizer_instance.recognize_whisper_api``). +The library `openai `__ is **required if and only if you want to use OpenAI Whisper API** (``recognizer_instance.recognize_whisper_api``). If not installed, everything in the library will still work, except calling ``recognizer_instance.recognize_whisper_api`` will raise an ``RequestError``. You can install it with ``python3 -m pip install SpeechRecognition[whisper-api]``. +Groq Whisper API (for Groq Whisper API users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The library `groq `__ is **required if and only if you want to use Groq Whisper API** (``recognizer_instance.recognize_groq``). 
+ +If not installed, everything in the library will still work, except calling ``recognizer_instance.recognize_groq`` will raise a ``SetupError``. + +You can install it with ``python3 -m pip install SpeechRecognition[groq]``. + Troubleshooting --------------- diff --git a/setup.cfg b/setup.cfg index 2124911a..dbba0147 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,6 +4,7 @@ dev = rstcheck pytest pytest-randomly + respx audio = PyAudio >= 0.2.11 pocketsphinx = @@ -13,5 +14,9 @@ whisper-local = soundfile whisper-api = openai + httpx < 0.28 +groq = + groq + httpx < 0.28 assemblyai = requests diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 253ab0fe..238d5e50 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1506,12 +1506,13 @@ def flush(self, *args, **kwargs): # At this time, the dependencies are not yet installed, resulting in a ModuleNotFoundError. # This is a workaround to resolve this issue try: - from .recognizers import google, whisper + from .recognizers import google, groq, whisper except (ModuleNotFoundError, ImportError): pass else: Recognizer.recognize_google = google.recognize_legacy Recognizer.recognize_whisper_api = whisper.recognize_whisper_api + Recognizer.recognize_groq = groq.recognize_groq # =============================== diff --git a/speech_recognition/recognizers/groq.py b/speech_recognition/recognizers/groq.py new file mode 100644 index 00000000..b36822f2 --- /dev/null +++ b/speech_recognition/recognizers/groq.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import os +from typing import Literal, TypedDict +from typing_extensions import Unpack + +from speech_recognition.audio import AudioData +from speech_recognition.exceptions import SetupError +from speech_recognition.recognizers.whisper_api import ( + OpenAICompatibleRecognizer, +) + +# https://console.groq.com/docs/speech-text#supported-models +GroqModel = Literal[ + "whisper-large-v3-turbo", "whisper-large-v3", 
"distil-whisper-large-v3-en" +] + + +class GroqOptionalParameters(TypedDict, total=False): + """Groq speech transcription's optional parameters. + + https://console.groq.com/docs/speech-text#transcription-endpoint-usage + """ + + prompt: str + response_format: str + temperature: float + language: str + + +def recognize_groq( + recognizer, + audio_data: "AudioData", + *, + model: GroqModel = "whisper-large-v3-turbo", + **kwargs: Unpack[GroqOptionalParameters], +) -> str: + """ + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Groq Whisper API. Optional keyword arguments (``prompt``, ``response_format``, ``temperature``, ``language``) are forwarded to the transcription request. + + This function requires login to Groq; visit https://console.groq.com/login, then generate API Key in `API Keys `__ menu. + + Detail: https://console.groq.com/docs/speech-text + + Raises a ``speech_recognition.exceptions.SetupError`` exception if there are any issues with the groq installation, or the environment variable is missing. + """ + if os.environ.get("GROQ_API_KEY") is None: + raise SetupError("Set environment variable ``GROQ_API_KEY``") + + try: + import groq + except ImportError: + raise SetupError( + "missing groq module: ensure that groq is set up correctly." 
+ ) + + recognizer = OpenAICompatibleRecognizer(groq.Groq()) + return recognizer.recognize(audio_data, model, **kwargs) diff --git a/speech_recognition/recognizers/whisper_api.py b/speech_recognition/recognizers/whisper_api.py new file mode 100644 index 00000000..c435ef59 --- /dev/null +++ b/speech_recognition/recognizers/whisper_api.py @@ -0,0 +1,22 @@ +from io import BytesIO + +from speech_recognition.audio import AudioData + + +class OpenAICompatibleRecognizer: + def __init__(self, client) -> None: + self.client = client + + def recognize(self, audio_data: "AudioData", model: str, **kwargs) -> str: + if not isinstance(audio_data, AudioData): + raise ValueError( + "``audio_data`` must be an ``AudioData`` instance" + ) + + wav_data = BytesIO(audio_data.get_wav_data()) + wav_data.name = "SpeechRecognition_audio.wav" + + transcript = self.client.audio.transcriptions.create( + file=wav_data, model=model, **kwargs + ) + return transcript.text diff --git a/tests/recognizers/test_groq.py b/tests/recognizers/test_groq.py new file mode 100644 index 00000000..c821eba5 --- /dev/null +++ b/tests/recognizers/test_groq.py @@ -0,0 +1,33 @@ +from unittest.mock import MagicMock + +import httpx +import respx + +from speech_recognition import AudioData, Recognizer +from speech_recognition.recognizers import groq + + +@respx.mock(assert_all_called=True, assert_all_mocked=True) +def test_transcribe_with_groq_whisper(respx_mock, monkeypatch): + monkeypatch.setenv("GROQ_API_KEY", "gsk_grok_api_key") + + respx_mock.post( + "https://api.groq.com/openai/v1/audio/transcriptions" + ).mock( + return_value=httpx.Response( + 200, + json={ + "text": "Transcription by Groq Whisper", + "x_groq": {"id": "req_unique_id"}, + }, + ) + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"audio_data" + + actual = groq.recognize_groq( + MagicMock(spec=Recognizer), audio_data, model="whisper-large-v3" + ) + + assert actual == "Transcription by Groq Whisper"