From 698d3bf16b0839971b35797970cf112f15cf23df Mon Sep 17 00:00:00 2001 From: ftnext Date: Sun, 26 Nov 2023 23:42:56 +0900 Subject: [PATCH 01/11] [chore] Run tests in Python 3.11 --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index bf6233be..e6b770a2 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: true matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v3 From 277e7b12d23fcdfd6a37d5eac1be651861bea215 Mon Sep 17 00:00:00 2001 From: ftnext Date: Wed, 29 Nov 2023 02:00:21 +0900 Subject: [PATCH 02/11] [test] Recognizer.__init__ --- tests/test_recognition.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_recognition.py b/tests/test_recognition.py index a4e5f4a0..201d3307 100644 --- a/tests/test_recognition.py +++ b/tests/test_recognition.py @@ -14,6 +14,17 @@ def setUp(self): self.AUDIO_FILE_ZH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "chinese.flac") self.WHISPER_CONFIG = {"temperature": 0} + def test_recognizer_attributes(self): + r = sr.Recognizer() + + self.assertEqual(r.energy_threshold, 300) + self.assertTrue(r.dynamic_energy_threshold) + self.assertEqual(r.dynamic_energy_adjustment_damping, 0.15) + self.assertEqual(r.dynamic_energy_ratio, 1.5) + self.assertEqual(r.pause_threshold, 0.8) + self.assertEqual(r.phrase_threshold, 0.3) + self.assertEqual(r.non_speaking_duration, 0.5) + def test_sphinx_english(self): r = sr.Recognizer() with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) From 66368ea34f196bad3b654d4405f64153d0e4f522 Mon Sep 17 00:00:00 2001 From: ftnext Date: Thu, 30 Nov 2023 23:58:15 +0900 Subject: [PATCH 03/11] [docs] Draft type hint (WIP) --- speech_recognition/recognizers/google.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 speech_recognition/recognizers/google.py diff --git a/speech_recognition/recognizers/google.py b/speech_recognition/recognizers/google.py new file mode 100644 index 00000000..16d0bc07 --- /dev/null +++ b/speech_recognition/recognizers/google.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from typing import TypedDict + + +class Alternative(TypedDict): + transcript: str + confidence: float + + +class Result(TypedDict): + alternative: list[Alternative] + + +class GoogleResponse(TypedDict): + result: list[Result] From d580435410e065e6ca14867d6687a23983b29566 Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 1 Dec 2023 01:10:44 +0900 Subject: [PATCH 04/11] [docs] Show Google API response as type --- setup.py | 2 +- speech_recognition/recognizers/google.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2b10a084..d83f374f 100644 --- a/setup.py +++ b/setup.py @@ -65,5 +65,5 @@ def run(self): "Topic :: Multimedia :: Sound/Audio :: Speech", ], python_requires=">=3.8", - install_requires=['requests>=2.26.0'], + install_requires=['requests>=2.26.0', "typing-extensions"], ) diff --git a/speech_recognition/recognizers/google.py b/speech_recognition/recognizers/google.py index 16d0bc07..228f7ae8 100644 --- a/speech_recognition/recognizers/google.py +++ b/speech_recognition/recognizers/google.py @@ -2,6 +2,8 @@ from typing import TypedDict +from typing_extensions import NotRequired + class Alternative(TypedDict): transcript: str @@ -10,7 +12,9 @@ class Alternative(TypedDict): class Result(TypedDict): alternative: list[Alternative] + final: bool class GoogleResponse(TypedDict): result: list[Result] + result_index: NotRequired[int] From 72d8973285c878e8873d107ec54dba7d13ac626e Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 1 Dec 2023 01:11:35 +0900 Subject: [PATCH 05/11] [docs] Add type hint in recognize_google --- speech_recognition/__init__.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index bbff8dad..e6f06604 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -2,6 +2,8 @@ """Library for performing speech recognition, with support for several engines and APIs, online and offline.""" +from __future__ import annotations + import io import os import tempfile @@ -42,6 +44,7 @@ WaitTimeoutError, ) from .recognizers import whisper +from .recognizers.google import Alternative, Result class AudioSource(object): @@ -716,9 +719,9 @@ def recognize_google(self, audio_data, key=None, language="en-US", pfilter=0, sh actual_result = [] for line in response_text.split("\n"): if not line: continue - result = json.loads(line)["result"] + result: list[Result] = json.loads(line)["result"] if len(result) != 0: - actual_result = result[0] + actual_result: Result = result[0] break # return results @@ -729,10 +732,10 @@ def recognize_google(self, audio_data, key=None, language="en-US", pfilter=0, sh if "confidence" in actual_result["alternative"]: # return alternative with highest confidence score - best_hypothesis = max(actual_result["alternative"], key=lambda alternative: alternative["confidence"]) + best_hypothesis: Alternative = max(actual_result["alternative"], key=lambda alternative: alternative["confidence"]) else: # when there is no confidence available, we arbitrarily choose the first hypothesis. - best_hypothesis = actual_result["alternative"][0] + best_hypothesis: Alternative = actual_result["alternative"][0] if "transcript" not in best_hypothesis: raise UnknownValueError() # https://cloud.google.com/speech-to-text/docs/basics#confidence-values # "Your code should not require the confidence field as it is not guaranteed to be accurate, or even set, in any of the results." From c1b4f0d963f36512845b4e738af248c143f6f3e6 Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 1 Dec 2023 01:39:44 +0900 Subject: [PATCH 06/11] [test] Add recognize_google test with mock --- tests/test_recognition.py | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/tests/test_recognition.py b/tests/test_recognition.py index 201d3307..c56e5bc1 100644 --- a/tests/test_recognition.py +++ b/tests/test_recognition.py @@ -1,8 +1,10 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +import http import os import unittest +from unittest.mock import patch, MagicMock import speech_recognition as sr @@ -30,11 +32,6 @@ def test_sphinx_english(self): with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) self.assertEqual(r.recognize_sphinx(audio), "one two three") - def test_google_english(self): - r = sr.Recognizer() - with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) - self.assertIn(r.recognize_google(audio), ["123", "1 2 3", "one two three"]) - def test_google_french(self): r = sr.Recognizer() with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source) @@ -108,5 +105,34 @@ def test_whisper_chinese(self): with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source) self.assertEqual(r.recognize_whisper(audio, model="small", language="chinese", **self.WHISPER_CONFIG), u"砸自己的腳") + +class RecognizeGoogleTestCase(unittest.TestCase): + @patch("speech_recognition.urlopen") + @patch("speech_recognition.Request") + def test_return_best_hypothesis_transcript_with_default_parameters(self, Request, urlopen): + response = MagicMock(spec=http.client.HTTPResponse) + urlopen.return_value = response + response.read.return_value = b"""\ +{"result":[]} +{"result":[{"alternative":[{"transcript":"one two three","confidence":0.49585345},{"transcript":"1 2","confidence":0.42899391}],"final":true}],"result_index":0} +""" + # mock has AudioData's attributes (e.g. sample_rate) + audio = MagicMock(spec=sr.audio.AudioData(None, 1, 1)) + audio.sample_rate = 16_000 + r = sr.Recognizer() + + actual = r.recognize_google(audio) + + self.assertEqual(actual, "one two three") + audio.get_flac_data.assert_called_once_with(convert_rate=None, convert_width=2) + Request.assert_called_once_with( + "http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw&pFilter=0", + data=audio.get_flac_data.return_value, + headers={"Content-Type": "audio/x-flac; rate=16000"}, + ) + urlopen.assert_called_once_with(Request.return_value, timeout=None) + response.read.assert_called_once_with() + + if __name__ == "__main__": unittest.main() From 4a3fc053dc213d1ea13922ac3afb9f01c3929a6f Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 1 Dec 2023 01:49:17 +0900 Subject: [PATCH 07/11] [test] Assert language parameter only --- tests/test_recognition.py | 51 +++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/tests/test_recognition.py b/tests/test_recognition.py index c56e5bc1..819cb39c 100644 --- a/tests/test_recognition.py +++ b/tests/test_recognition.py @@ -32,16 +32,6 @@ def test_sphinx_english(self): with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) self.assertEqual(r.recognize_sphinx(audio), "one two three") - def test_google_french(self): - r = sr.Recognizer() - with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source) - self.assertEqual(r.recognize_google(audio, language="fr-FR"), u"et c'est la dictée numéro 1") - - def test_google_chinese(self): - r = sr.Recognizer() - with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source) - self.assertEqual(r.recognize_google(audio, language="zh-CN"), u"砸自己的脚") - @unittest.skipUnless("WIT_AI_KEY" in os.environ, "requires Wit.ai key to be specified in WIT_AI_KEY environment variable") def test_wit_english(self): r = sr.Recognizer() @@ -106,32 +96,47 @@ def test_whisper_chinese(self): self.assertEqual(r.recognize_whisper(audio, model="small", language="chinese", **self.WHISPER_CONFIG), u"砸自己的腳") +@patch("speech_recognition.urlopen") +@patch("speech_recognition.Request") class RecognizeGoogleTestCase(unittest.TestCase): - @patch("speech_recognition.urlopen") - @patch("speech_recognition.Request") - def test_return_best_hypothesis_transcript_with_default_parameters(self, Request, urlopen): - response = MagicMock(spec=http.client.HTTPResponse) - urlopen.return_value = response - response.read.return_value = b"""\ + def setUp(self) -> None: + self.response = MagicMock(spec=http.client.HTTPResponse) + self.response.read.return_value = b"""\ {"result":[]} {"result":[{"alternative":[{"transcript":"one two three","confidence":0.49585345},{"transcript":"1 2","confidence":0.42899391}],"final":true}],"result_index":0} """ # mock has AudioData's attributes (e.g. sample_rate) - audio = MagicMock(spec=sr.audio.AudioData(None, 1, 1)) - audio.sample_rate = 16_000 - r = sr.Recognizer() + self.audio = MagicMock(spec=sr.audio.AudioData(None, 1, 1)) - actual = r.recognize_google(audio) + self.r = sr.Recognizer() + + def test_return_best_hypothesis_transcript_with_default_parameters(self, Request, urlopen): + urlopen.return_value = self.response + self.audio.sample_rate = 16_000 + + actual = self.r.recognize_google(self.audio) self.assertEqual(actual, "one two three") - audio.get_flac_data.assert_called_once_with(convert_rate=None, convert_width=2) + self.audio.get_flac_data.assert_called_once_with(convert_rate=None, convert_width=2) Request.assert_called_once_with( "http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw&pFilter=0", - data=audio.get_flac_data.return_value, + data=self.audio.get_flac_data.return_value, headers={"Content-Type": "audio/x-flac; rate=16000"}, ) urlopen.assert_called_once_with(Request.return_value, timeout=None) - response.read.assert_called_once_with() + self.response.read.assert_called_once_with() + + def test_specified_language_request(self, Request, urlopen): + urlopen.return_value = self.response + self.audio.sample_rate = 16_000 + + _ = self.r.recognize_google(self.audio, language="zh-CN") + + Request.assert_called_once_with( + "http://www.google.com/speech-api/v2/recognize?client=chromium&lang=zh-CN&key=AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw&pFilter=0", + data=self.audio.get_flac_data.return_value, + headers={"Content-Type": "audio/x-flac; rate=16000"}, + ) if __name__ == "__main__": From 0697776effd3f136b3bc183ca6405be3da017a3d Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 1 Dec 2023 01:51:27 +0900 Subject: [PATCH 08/11] [test] Add missing assertion of attribute --- tests/test_recognition.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_recognition.py b/tests/test_recognition.py index 819cb39c..a0d6a3c9 100644 --- a/tests/test_recognition.py +++ b/tests/test_recognition.py @@ -24,6 +24,7 @@ def test_recognizer_attributes(self): self.assertEqual(r.dynamic_energy_adjustment_damping, 0.15) self.assertEqual(r.dynamic_energy_ratio, 1.5) self.assertEqual(r.pause_threshold, 0.8) + self.assertIsNone(r.operation_timeout) self.assertEqual(r.phrase_threshold, 0.3) self.assertEqual(r.non_speaking_duration, 0.5) From 9d70ebbeca444a1d92b8aacfc76ef1897fc6cf18 Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 1 Dec 2023 02:05:21 +0900 Subject: [PATCH 09/11] [test] recognize_google each parameter's behavior --- tests/test_recognition.py | 43 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/test_recognition.py b/tests/test_recognition.py index a0d6a3c9..c5ec085d 100644 --- a/tests/test_recognition.py +++ b/tests/test_recognition.py @@ -127,6 +127,14 @@ def test_return_best_hypothesis_transcript_with_default_parameters(self, Request urlopen.assert_called_once_with(Request.return_value, timeout=None) self.response.read.assert_called_once_with() + def test_minimum_sample_rate(self, Request, urlopen): + urlopen.return_value = self.response + self.audio.sample_rate = 7_999 + + _ = self.r.recognize_google(self.audio) + + self.audio.get_flac_data.assert_called_once_with(convert_rate=8000, convert_width=2) + def test_specified_language_request(self, Request, urlopen): urlopen.return_value = self.response self.audio.sample_rate = 16_000 @@ -139,6 +147,41 @@ def test_specified_language_request(self, Request, urlopen): headers={"Content-Type": "audio/x-flac; rate=16000"}, ) + def test_specified_key_request(self, Request, urlopen): + urlopen.return_value = self.response + self.audio.sample_rate = 16_000 + + _ = self.r.recognize_google(self.audio, key="awesome-key") + + Request.assert_called_once_with( + "http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=awesome-key&pFilter=0", + data=self.audio.get_flac_data.return_value, + headers={"Content-Type": "audio/x-flac; rate=16000"}, + ) + + def test_show_all(self, Request, urlopen): + urlopen.return_value = self.response + self.audio.sample_rate = 16_000 + + actual = self.r.recognize_google(self.audio, show_all=True) + + expected = { + "alternative": [ + {"transcript": "one two three", "confidence": 0.49585345}, + {"transcript": "1 2", "confidence": 0.42899391} + ], + "final": True + } + self.assertEqual(actual, expected) + + def test_with_confidence(self, Request, urlopen): + urlopen.return_value = self.response + self.audio.sample_rate = 16_000 + + actual = self.r.recognize_google(self.audio, with_confidence=True) + + self.assertEqual(actual, ("one two three", 0.49585345)) + if __name__ == "__main__": unittest.main() From 6464414c2b4c9d0041ccb2c7a212b4eba99b23f7 Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 1 Dec 2023 07:52:20 +0900 Subject: [PATCH 10/11] [bugfix] No module named 'typing_extensions' at pip install . --- speech_recognition/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index e6f06604..9f270d9a 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -21,6 +21,7 @@ import hmac import time import uuid +from typing import TYPE_CHECKING try: import requests @@ -44,7 +45,8 @@ WaitTimeoutError, ) from .recognizers import whisper -from .recognizers.google import Alternative, Result +if TYPE_CHECKING: + from .recognizers.google import Alternative, Result class AudioSource(object): From 285ac3ada933b117bc1260cd11c55bede1cb7fff Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 1 Dec 2023 07:57:12 +0900 Subject: [PATCH 11/11] [style] isort __init__.py --- speech_recognition/__init__.py | 59 +++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 9f270d9a..3d47c9b6 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -4,50 +4,50 @@ from __future__ import annotations -import io -import os -import tempfile -import sys -import subprocess -import wave import aifc -import math import audioop -import collections -import json import base64 -import threading +import collections import hashlib import hmac +import io +import json +import math +import os +import subprocess +import sys +import tempfile +import threading import time import uuid +import wave from typing import TYPE_CHECKING +from urllib.error import HTTPError, URLError +from urllib.parse import urlencode +from urllib.request import Request, urlopen try: import requests except (ModuleNotFoundError, ImportError): pass -__author__ = "Anthony Zhang (Uberi)" -__version__ = "3.10.0" -__license__ = "BSD" - -from urllib.parse import urlencode -from urllib.request import Request, urlopen -from urllib.error import URLError, HTTPError - from .audio import AudioData, get_flac_converter from .exceptions import ( RequestError, - TranscriptionFailed, + TranscriptionFailed, TranscriptionNotReady, UnknownValueError, WaitTimeoutError, ) from .recognizers import whisper + if TYPE_CHECKING: from .recognizers.google import Alternative, Result +__author__ = "Anthony Zhang (Uberi)" +__version__ = "3.10.0" +__license__ = "BSD" + class AudioSource(object): def __init__(self): @@ -602,7 +602,7 @@ def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, g # import the PocketSphinx speech recognition module try: - from pocketsphinx import pocketsphinx, Jsgf, FsgModel + from pocketsphinx import FsgModel, Jsgf, pocketsphinx except ImportError: raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.") @@ -768,8 +768,9 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en try: import socket - from google.cloud import speech + from google.api_core.exceptions import GoogleAPICallError + from google.cloud import speech except ImportError: raise RequestError('missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly.') @@ -877,7 +878,9 @@ def recognize_azure(self, audio_data, key, language="en-US", profanity="masked", access_token, expire_time = getattr(self, "azure_cached_access_token", None), getattr(self, "azure_cached_access_token_expiry", None) allow_caching = True try: - from time import monotonic # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+ + from time import ( + monotonic, # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+ + ) except ImportError: expire_time = None # monotonic time not available, don't cache access tokens allow_caching = False # don't allow caching, since monotonic time isn't available @@ -969,7 +972,9 @@ def recognize_bing(self, audio_data, key, language="en-US", show_all=False): access_token, expire_time = getattr(self, "bing_cached_access_token", None), getattr(self, "bing_cached_access_token_expiry", None) allow_caching = True try: - from time import monotonic # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+ + from time import ( + monotonic, # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+ + ) except ImportError: expire_time = None # monotonic time not available, don't cache access tokens allow_caching = False # don't allow caching, since monotonic time isn't available @@ -1135,9 +1140,10 @@ def recognize_amazon(self, audio_data, bucket_name=None, access_key_id=None, sec assert access_key_id is None or isinstance(access_key_id, str), "``access_key_id`` must be a string" assert secret_access_key is None or isinstance(secret_access_key, str), "``secret_access_key`` must be a string" assert region is None or isinstance(region, str), "``region`` must be a string" + import multiprocessing import traceback import uuid - import multiprocessing + from botocore.exceptions import ClientError proc = multiprocessing.current_process() @@ -1213,7 +1219,8 @@ def recognize_amazon(self, audio_data, bucket_name=None, access_key_id=None, sec # Retrieve transcription JSON containing transcript. transcript_uri = job['Transcript']['TranscriptFileUri'] - import urllib.request, json + import json + import urllib.request with urllib.request.urlopen(transcript_uri) as json_data: d = json.load(json_data) confidences = [] @@ -1509,7 +1516,7 @@ def recognize_whisper(self, audio_data, model="base", show_dict=False, load_opti recognize_whisper_api = whisper.recognize_whisper_api def recognize_vosk(self, audio_data, language='en'): - from vosk import Model, KaldiRecognizer + from vosk import KaldiRecognizer, Model assert isinstance(audio_data, AudioData), "Data must be audio data"