diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index bf6233be..e6b770a2 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: true matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v3 diff --git a/setup.py b/setup.py index 2b10a084..d83f374f 100644 --- a/setup.py +++ b/setup.py @@ -65,5 +65,5 @@ def run(self): "Topic :: Multimedia :: Sound/Audio :: Speech", ], python_requires=">=3.8", - install_requires=['requests>=2.26.0'], + install_requires=['requests>=2.26.0', "typing-extensions"], ) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index bbff8dad..3d47c9b6 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -2,47 +2,52 @@ """Library for performing speech recognition, with support for several engines and APIs, online and offline.""" -import io -import os -import tempfile -import sys -import subprocess -import wave +from __future__ import annotations + import aifc -import math import audioop -import collections -import json import base64 -import threading +import collections import hashlib import hmac +import io +import json +import math +import os +import subprocess +import sys +import tempfile +import threading import time import uuid +import wave +from typing import TYPE_CHECKING +from urllib.error import HTTPError, URLError +from urllib.parse import urlencode +from urllib.request import Request, urlopen try: import requests except (ModuleNotFoundError, ImportError): pass -__author__ = "Anthony Zhang (Uberi)" -__version__ = "3.10.0" -__license__ = "BSD" - -from urllib.parse import urlencode -from urllib.request import Request, urlopen -from urllib.error import URLError, HTTPError - from .audio import AudioData, get_flac_converter from .exceptions import ( RequestError, - TranscriptionFailed, + TranscriptionFailed, TranscriptionNotReady, UnknownValueError, WaitTimeoutError, ) from .recognizers import whisper +if TYPE_CHECKING: + from .recognizers.google import Alternative, Result + +__author__ = "Anthony Zhang (Uberi)" +__version__ = "3.10.0" +__license__ = "BSD" + class AudioSource(object): def __init__(self): @@ -597,7 +602,7 @@ def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, g # import the PocketSphinx speech recognition module try: - from pocketsphinx import pocketsphinx, Jsgf, FsgModel + from pocketsphinx import FsgModel, Jsgf, pocketsphinx except ImportError: raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.") @@ -716,9 +721,9 @@ def recognize_google(self, audio_data, key=None, language="en-US", pfilter=0, sh actual_result = [] for line in response_text.split("\n"): if not line: continue - result = json.loads(line)["result"] + result: list[Result] = json.loads(line)["result"] if len(result) != 0: - actual_result = result[0] + actual_result: Result = result[0] break # return results @@ -729,10 +734,10 @@ def recognize_google(self, audio_data, key=None, language="en-US", pfilter=0, sh if "confidence" in actual_result["alternative"]: # return alternative with highest confidence score - best_hypothesis = max(actual_result["alternative"], key=lambda alternative: alternative["confidence"]) + best_hypothesis: Alternative = max(actual_result["alternative"], key=lambda alternative: alternative["confidence"]) else: # when there is no confidence available, we arbitrarily choose the first hypothesis. - best_hypothesis = actual_result["alternative"][0] + best_hypothesis: Alternative = actual_result["alternative"][0] if "transcript" not in best_hypothesis: raise UnknownValueError() # https://cloud.google.com/speech-to-text/docs/basics#confidence-values # "Your code should not require the confidence field as it is not guaranteed to be accurate, or even set, in any of the results." @@ -763,8 +768,9 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en try: import socket - from google.cloud import speech + from google.api_core.exceptions import GoogleAPICallError + from google.cloud import speech except ImportError: raise RequestError('missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly.') @@ -872,7 +878,9 @@ def recognize_azure(self, audio_data, key, language="en-US", profanity="masked", access_token, expire_time = getattr(self, "azure_cached_access_token", None), getattr(self, "azure_cached_access_token_expiry", None) allow_caching = True try: - from time import monotonic # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+ + from time import ( + monotonic, # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+ + ) except ImportError: expire_time = None # monotonic time not available, don't cache access tokens allow_caching = False # don't allow caching, since monotonic time isn't available @@ -964,7 +972,9 @@ def recognize_bing(self, audio_data, key, language="en-US", show_all=False): access_token, expire_time = getattr(self, "bing_cached_access_token", None), getattr(self, "bing_cached_access_token_expiry", None) allow_caching = True try: - from time import monotonic # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+ + from time import ( + monotonic, # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+ + ) except ImportError: expire_time = None # monotonic time not available, don't cache access tokens allow_caching = False # don't allow caching, since monotonic time isn't available @@ -1130,9 +1140,10 @@ def recognize_amazon(self, audio_data, bucket_name=None, access_key_id=None, sec assert access_key_id is None or isinstance(access_key_id, str), "``access_key_id`` must be a string" assert secret_access_key is None or isinstance(secret_access_key, str), "``secret_access_key`` must be a string" assert region is None or isinstance(region, str), "``region`` must be a string" + import multiprocessing import traceback import uuid - import multiprocessing + from botocore.exceptions import ClientError proc = multiprocessing.current_process() @@ -1208,7 +1219,8 @@ def recognize_amazon(self, audio_data, bucket_name=None, access_key_id=None, sec # Retrieve transcription JSON containing transcript. transcript_uri = job['Transcript']['TranscriptFileUri'] - import urllib.request, json + import json + import urllib.request with urllib.request.urlopen(transcript_uri) as json_data: d = json.load(json_data) confidences = [] @@ -1504,7 +1516,7 @@ def recognize_whisper(self, audio_data, model="base", show_dict=False, load_opti recognize_whisper_api = whisper.recognize_whisper_api def recognize_vosk(self, audio_data, language='en'): - from vosk import Model, KaldiRecognizer + from vosk import KaldiRecognizer, Model assert isinstance(audio_data, AudioData), "Data must be audio data" diff --git a/speech_recognition/recognizers/google.py b/speech_recognition/recognizers/google.py new file mode 100644 index 00000000..228f7ae8 --- /dev/null +++ b/speech_recognition/recognizers/google.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from typing import TypedDict + +from typing_extensions import NotRequired + + +class Alternative(TypedDict): + transcript: str + confidence: float + + +class Result(TypedDict): + alternative: list[Alternative] + final: bool + + +class GoogleResponse(TypedDict): + result: list[Result] + result_index: NotRequired[int] diff --git a/tests/test_recognition.py b/tests/test_recognition.py index a4e5f4a0..c5ec085d 100644 --- a/tests/test_recognition.py +++ b/tests/test_recognition.py @@ -1,8 +1,10 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +import http import os import unittest +from unittest.mock import patch, MagicMock import speech_recognition as sr @@ -14,25 +16,22 @@ def setUp(self): self.AUDIO_FILE_ZH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "chinese.flac") self.WHISPER_CONFIG = {"temperature": 0} - def test_sphinx_english(self): + def test_recognizer_attributes(self): r = sr.Recognizer() - with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) - self.assertEqual(r.recognize_sphinx(audio), "one two three") - def test_google_english(self): - r = sr.Recognizer() - with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) - self.assertIn(r.recognize_google(audio), ["123", "1 2 3", "one two three"]) + self.assertEqual(r.energy_threshold, 300) + self.assertTrue(r.dynamic_energy_threshold) + self.assertEqual(r.dynamic_energy_adjustment_damping, 0.15) + self.assertEqual(r.dynamic_energy_ratio, 1.5) + self.assertEqual(r.pause_threshold, 0.8) + self.assertIsNone(r.operation_timeout) + self.assertEqual(r.phrase_threshold, 0.3) + self.assertEqual(r.non_speaking_duration, 0.5) - def test_google_french(self): - r = sr.Recognizer() - with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source) - self.assertEqual(r.recognize_google(audio, language="fr-FR"), u"et c'est la dictée numéro 1") - - def test_google_chinese(self): + def test_sphinx_english(self): r = sr.Recognizer() - with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source) - self.assertEqual(r.recognize_google(audio, language="zh-CN"), u"砸自己的脚") + with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) + self.assertEqual(r.recognize_sphinx(audio), "one two three") @unittest.skipUnless("WIT_AI_KEY" in os.environ, "requires Wit.ai key to be specified in WIT_AI_KEY environment variable") def test_wit_english(self): @@ -97,5 +96,92 @@ def test_whisper_chinese(self): with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source) self.assertEqual(r.recognize_whisper(audio, model="small", language="chinese", **self.WHISPER_CONFIG), u"砸自己的腳") + +@patch("speech_recognition.urlopen") +@patch("speech_recognition.Request") +class RecognizeGoogleTestCase(unittest.TestCase): + def setUp(self) -> None: + self.response = MagicMock(spec=http.client.HTTPResponse) + self.response.read.return_value = b"""\ +{"result":[]} +{"result":[{"alternative":[{"transcript":"one two three","confidence":0.49585345},{"transcript":"1 2","confidence":0.42899391}],"final":true}],"result_index":0} +""" + # mock has AudioData's attributes (e.g. sample_rate) + self.audio = MagicMock(spec=sr.audio.AudioData(None, 1, 1)) + + self.r = sr.Recognizer() + + def test_return_best_hypothesis_transcript_with_default_parameters(self, Request, urlopen): + urlopen.return_value = self.response + self.audio.sample_rate = 16_000 + + actual = self.r.recognize_google(self.audio) + + self.assertEqual(actual, "one two three") + self.audio.get_flac_data.assert_called_once_with(convert_rate=None, convert_width=2) + Request.assert_called_once_with( + "http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw&pFilter=0", + data=self.audio.get_flac_data.return_value, + headers={"Content-Type": "audio/x-flac; rate=16000"}, + ) + urlopen.assert_called_once_with(Request.return_value, timeout=None) + self.response.read.assert_called_once_with() + + def test_minimum_sample_rate(self, Request, urlopen): + urlopen.return_value = self.response + self.audio.sample_rate = 7_999 + + _ = self.r.recognize_google(self.audio) + + self.audio.get_flac_data.assert_called_once_with(convert_rate=8000, convert_width=2) + + def test_specified_language_request(self, Request, urlopen): + urlopen.return_value = self.response + self.audio.sample_rate = 16_000 + + _ = self.r.recognize_google(self.audio, language="zh-CN") + + Request.assert_called_once_with( + "http://www.google.com/speech-api/v2/recognize?client=chromium&lang=zh-CN&key=AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw&pFilter=0", + data=self.audio.get_flac_data.return_value, + headers={"Content-Type": "audio/x-flac; rate=16000"}, + ) + + def test_specified_key_request(self, Request, urlopen): + urlopen.return_value = self.response + self.audio.sample_rate = 16_000 + + _ = self.r.recognize_google(self.audio, key="awesome-key") + + Request.assert_called_once_with( + "http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=awesome-key&pFilter=0", + data=self.audio.get_flac_data.return_value, + headers={"Content-Type": "audio/x-flac; rate=16000"}, + ) + + def test_show_all(self, Request, urlopen): + urlopen.return_value = self.response + self.audio.sample_rate = 16_000 + + actual = self.r.recognize_google(self.audio, show_all=True) + + expected = { + "alternative": [ + {"transcript": "one two three", "confidence": 0.49585345}, + {"transcript": "1 2", "confidence": 0.42899391} + ], + "final": True + } + self.assertEqual(actual, expected) + + def test_with_confidence(self, Request, urlopen): + urlopen.return_value = self.response + self.audio.sample_rate = 16_000 + + actual = self.r.recognize_google(self.audio, with_confidence=True) + + self.assertEqual(actual, ("one two three", 0.49585345)) + + if __name__ == "__main__": unittest.main()