Skip to content

Commit

Permalink
Merge pull request #716 from ftnext/i715-support-311
Browse files Browse the repository at this point in the history
Run tests in Python 3.11
  • Loading branch information
ftnext authored Dec 1, 2023
2 parents 8b07762 + 285ac3a commit e681993
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 47 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.8", "3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,5 +65,5 @@ def run(self):
"Topic :: Multimedia :: Sound/Audio :: Speech",
],
python_requires=">=3.8",
install_requires=['requests>=2.26.0'],
install_requires=['requests>=2.26.0', "typing-extensions"],
)
72 changes: 42 additions & 30 deletions speech_recognition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,47 +2,52 @@

"""Library for performing speech recognition, with support for several engines and APIs, online and offline."""

import io
import os
import tempfile
import sys
import subprocess
import wave
from __future__ import annotations

import aifc
import math
import audioop
import collections
import json
import base64
import threading
import collections
import hashlib
import hmac
import io
import json
import math
import os
import subprocess
import sys
import tempfile
import threading
import time
import uuid
import wave
from typing import TYPE_CHECKING
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode
from urllib.request import Request, urlopen

try:
import requests
except (ModuleNotFoundError, ImportError):
pass

__author__ = "Anthony Zhang (Uberi)"
__version__ = "3.10.0"
__license__ = "BSD"

from urllib.parse import urlencode
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

from .audio import AudioData, get_flac_converter
from .exceptions import (
RequestError,
TranscriptionFailed,
TranscriptionFailed,
TranscriptionNotReady,
UnknownValueError,
WaitTimeoutError,
)
from .recognizers import whisper

if TYPE_CHECKING:
from .recognizers.google import Alternative, Result

__author__ = "Anthony Zhang (Uberi)"
__version__ = "3.10.0"
__license__ = "BSD"


class AudioSource(object):
def __init__(self):
Expand Down Expand Up @@ -597,7 +602,7 @@ def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, g

# import the PocketSphinx speech recognition module
try:
from pocketsphinx import pocketsphinx, Jsgf, FsgModel
from pocketsphinx import FsgModel, Jsgf, pocketsphinx

except ImportError:
raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")
Expand Down Expand Up @@ -716,9 +721,9 @@ def recognize_google(self, audio_data, key=None, language="en-US", pfilter=0, sh
actual_result = []
for line in response_text.split("\n"):
if not line: continue
result = json.loads(line)["result"]
result: list[Result] = json.loads(line)["result"]
if len(result) != 0:
actual_result = result[0]
actual_result: Result = result[0]
break

# return results
Expand All @@ -729,10 +734,10 @@ def recognize_google(self, audio_data, key=None, language="en-US", pfilter=0, sh

if "confidence" in actual_result["alternative"]:
# return alternative with highest confidence score
best_hypothesis = max(actual_result["alternative"], key=lambda alternative: alternative["confidence"])
best_hypothesis: Alternative = max(actual_result["alternative"], key=lambda alternative: alternative["confidence"])
else:
# when there is no confidence available, we arbitrarily choose the first hypothesis.
best_hypothesis = actual_result["alternative"][0]
best_hypothesis: Alternative = actual_result["alternative"][0]
if "transcript" not in best_hypothesis: raise UnknownValueError()
# https://cloud.google.com/speech-to-text/docs/basics#confidence-values
# "Your code should not require the confidence field as it is not guaranteed to be accurate, or even set, in any of the results."
Expand Down Expand Up @@ -763,8 +768,9 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en

try:
import socket
from google.cloud import speech

from google.api_core.exceptions import GoogleAPICallError
from google.cloud import speech
except ImportError:
raise RequestError('missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly.')

Expand Down Expand Up @@ -872,7 +878,9 @@ def recognize_azure(self, audio_data, key, language="en-US", profanity="masked",
access_token, expire_time = getattr(self, "azure_cached_access_token", None), getattr(self, "azure_cached_access_token_expiry", None)
allow_caching = True
try:
from time import monotonic # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+
from time import (
monotonic, # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+
)
except ImportError:
expire_time = None # monotonic time not available, don't cache access tokens
allow_caching = False # don't allow caching, since monotonic time isn't available
Expand Down Expand Up @@ -964,7 +972,9 @@ def recognize_bing(self, audio_data, key, language="en-US", show_all=False):
access_token, expire_time = getattr(self, "bing_cached_access_token", None), getattr(self, "bing_cached_access_token_expiry", None)
allow_caching = True
try:
from time import monotonic # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+
from time import (
monotonic, # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+
)
except ImportError:
expire_time = None # monotonic time not available, don't cache access tokens
allow_caching = False # don't allow caching, since monotonic time isn't available
Expand Down Expand Up @@ -1130,9 +1140,10 @@ def recognize_amazon(self, audio_data, bucket_name=None, access_key_id=None, sec
assert access_key_id is None or isinstance(access_key_id, str), "``access_key_id`` must be a string"
assert secret_access_key is None or isinstance(secret_access_key, str), "``secret_access_key`` must be a string"
assert region is None or isinstance(region, str), "``region`` must be a string"
import multiprocessing
import traceback
import uuid
import multiprocessing

from botocore.exceptions import ClientError
proc = multiprocessing.current_process()

Expand Down Expand Up @@ -1208,7 +1219,8 @@ def recognize_amazon(self, audio_data, bucket_name=None, access_key_id=None, sec

# Retrieve transcription JSON containing transcript.
transcript_uri = job['Transcript']['TranscriptFileUri']
import urllib.request, json
import json
import urllib.request
with urllib.request.urlopen(transcript_uri) as json_data:
d = json.load(json_data)
confidences = []
Expand Down Expand Up @@ -1504,7 +1516,7 @@ def recognize_whisper(self, audio_data, model="base", show_dict=False, load_opti
recognize_whisper_api = whisper.recognize_whisper_api

def recognize_vosk(self, audio_data, language='en'):
from vosk import Model, KaldiRecognizer
from vosk import KaldiRecognizer, Model

assert isinstance(audio_data, AudioData), "Data must be audio data"

Expand Down
20 changes: 20 additions & 0 deletions speech_recognition/recognizers/google.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from __future__ import annotations

from typing import TypedDict

from typing_extensions import NotRequired


class Alternative(TypedDict):
    """One transcription hypothesis from the Google Speech API."""

    # Recognized text for this hypothesis.
    transcript: str
    # Confidence score; recognize_google picks the alternative with the
    # highest value when this key is present.
    confidence: float


class Result(TypedDict):
    """A single recognition result holding its ranked alternatives."""

    # Candidate transcriptions for this utterance.
    alternative: list[Alternative]
    final: bool


class GoogleResponse(TypedDict):
    """Top-level shape of one JSON line returned by the Google Speech API."""

    result: list[Result]
    # Absent on lines that carry no results (e.g. the initial {"result":[]}).
    result_index: NotRequired[int]
116 changes: 101 additions & 15 deletions tests/test_recognition.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import http
import os
import unittest
from unittest.mock import patch, MagicMock

import speech_recognition as sr

Expand All @@ -14,25 +16,22 @@ def setUp(self):
self.AUDIO_FILE_ZH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "chinese.flac")
self.WHISPER_CONFIG = {"temperature": 0}

def test_sphinx_english(self):
def test_recognizer_attributes(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_sphinx(audio), "one two three")

def test_google_english(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertIn(r.recognize_google(audio), ["123", "1 2 3", "one two three"])
self.assertEqual(r.energy_threshold, 300)
self.assertTrue(r.dynamic_energy_threshold)
self.assertEqual(r.dynamic_energy_adjustment_damping, 0.15)
self.assertEqual(r.dynamic_energy_ratio, 1.5)
self.assertEqual(r.pause_threshold, 0.8)
self.assertIsNone(r.operation_timeout)
self.assertEqual(r.phrase_threshold, 0.3)
self.assertEqual(r.non_speaking_duration, 0.5)

def test_google_french(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source)
self.assertEqual(r.recognize_google(audio, language="fr-FR"), u"et c'est la dictée numéro 1")

def test_google_chinese(self):
def test_sphinx_english(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
self.assertEqual(r.recognize_google(audio, language="zh-CN"), u"砸自己的脚")
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_sphinx(audio), "one two three")

@unittest.skipUnless("WIT_AI_KEY" in os.environ, "requires Wit.ai key to be specified in WIT_AI_KEY environment variable")
def test_wit_english(self):
Expand Down Expand Up @@ -97,5 +96,92 @@ def test_whisper_chinese(self):
with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
self.assertEqual(r.recognize_whisper(audio, model="small", language="chinese", **self.WHISPER_CONFIG), u"砸自己的腳")


# NOTE: @patch decorators apply bottom-up, so every test method receives the
# mocks in the order (Request, urlopen).
@patch("speech_recognition.urlopen")
@patch("speech_recognition.Request")
class RecognizeGoogleTestCase(unittest.TestCase):
    """Tests for Recognizer.recognize_google with the HTTP layer mocked out."""

    def setUp(self) -> None:
        # Canned newline-delimited JSON as returned by the Google Speech API:
        # the first line is an empty result, the second holds the hypotheses.
        self.response = MagicMock(spec=http.client.HTTPResponse)
        self.response.read.return_value = b"""\
{"result":[]}
{"result":[{"alternative":[{"transcript":"one two three","confidence":0.49585345},{"transcript":"1 2","confidence":0.42899391}],"final":true}],"result_index":0}
"""
        # mock has AudioData's attributes (e.g. sample_rate)
        self.audio = MagicMock(spec=sr.audio.AudioData(None, 1, 1))

        self.r = sr.Recognizer()

    def test_return_best_hypothesis_transcript_with_default_parameters(self, Request, urlopen):
        urlopen.return_value = self.response
        self.audio.sample_rate = 16_000

        actual = self.r.recognize_google(self.audio)

        # The alternative with the highest confidence (0.49585345) wins.
        self.assertEqual(actual, "one two three")
        self.audio.get_flac_data.assert_called_once_with(convert_rate=None, convert_width=2)
        Request.assert_called_once_with(
            "http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw&pFilter=0",
            data=self.audio.get_flac_data.return_value,
            headers={"Content-Type": "audio/x-flac; rate=16000"},
        )
        urlopen.assert_called_once_with(Request.return_value, timeout=None)
        self.response.read.assert_called_once_with()

    def test_minimum_sample_rate(self, Request, urlopen):
        # Sample rates below 8000 Hz must be up-converted to 8000 Hz.
        urlopen.return_value = self.response
        self.audio.sample_rate = 7_999

        _ = self.r.recognize_google(self.audio)

        self.audio.get_flac_data.assert_called_once_with(convert_rate=8000, convert_width=2)

    def test_specified_language_request(self, Request, urlopen):
        # The language keyword must be forwarded as the lang= query parameter.
        urlopen.return_value = self.response
        self.audio.sample_rate = 16_000

        _ = self.r.recognize_google(self.audio, language="zh-CN")

        Request.assert_called_once_with(
            "http://www.google.com/speech-api/v2/recognize?client=chromium&lang=zh-CN&key=AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw&pFilter=0",
            data=self.audio.get_flac_data.return_value,
            headers={"Content-Type": "audio/x-flac; rate=16000"},
        )

    def test_specified_key_request(self, Request, urlopen):
        # A caller-supplied API key must replace the built-in default key.
        urlopen.return_value = self.response
        self.audio.sample_rate = 16_000

        _ = self.r.recognize_google(self.audio, key="awesome-key")

        Request.assert_called_once_with(
            "http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=awesome-key&pFilter=0",
            data=self.audio.get_flac_data.return_value,
            headers={"Content-Type": "audio/x-flac; rate=16000"},
        )

    def test_show_all(self, Request, urlopen):
        # show_all=True returns the parsed result dict instead of a transcript.
        urlopen.return_value = self.response
        self.audio.sample_rate = 16_000

        actual = self.r.recognize_google(self.audio, show_all=True)

        expected = {
            "alternative": [
                {"transcript": "one two three", "confidence": 0.49585345},
                {"transcript": "1 2", "confidence": 0.42899391}
            ],
            "final": True
        }
        self.assertEqual(actual, expected)

    def test_with_confidence(self, Request, urlopen):
        # with_confidence=True returns a (transcript, confidence) tuple.
        urlopen.return_value = self.response
        self.audio.sample_rate = 16_000

        actual = self.r.recognize_google(self.audio, with_confidence=True)

        self.assertEqual(actual, ("one two three", 0.49585345))


# Allow running this test module directly (python tests/test_recognition.py).
if __name__ == "__main__":
    unittest.main()

0 comments on commit e681993

Please sign in to comment.