Skip to content

Commit

Permalink
Merge pull request #716 from ftnext/i715-support-311
Browse files Browse the repository at this point in the history
Run tests in Python 3.11
  • Loading branch information
ftnext authored Dec 1, 2023
2 parents 8b07762 + 285ac3a commit e681993
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 47 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.8", "3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,5 +65,5 @@ def run(self):
"Topic :: Multimedia :: Sound/Audio :: Speech",
],
python_requires=">=3.8",
install_requires=['requests>=2.26.0'],
install_requires=['requests>=2.26.0', "typing-extensions"],
)
72 changes: 42 additions & 30 deletions speech_recognition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,47 +2,52 @@

"""Library for performing speech recognition, with support for several engines and APIs, online and offline."""

import io
import os
import tempfile
import sys
import subprocess
import wave
from __future__ import annotations

import aifc
import math
import audioop
import collections
import json
import base64
import threading
import collections
import hashlib
import hmac
import io
import json
import math
import os
import subprocess
import sys
import tempfile
import threading
import time
import uuid
import wave
from typing import TYPE_CHECKING
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode
from urllib.request import Request, urlopen

try:
import requests
except (ModuleNotFoundError, ImportError):
pass

__author__ = "Anthony Zhang (Uberi)"
__version__ = "3.10.0"
__license__ = "BSD"

from urllib.parse import urlencode
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

from .audio import AudioData, get_flac_converter
from .exceptions import (
RequestError,
TranscriptionFailed,
TranscriptionFailed,
TranscriptionNotReady,
UnknownValueError,
WaitTimeoutError,
)
from .recognizers import whisper

if TYPE_CHECKING:
from .recognizers.google import Alternative, Result

__author__ = "Anthony Zhang (Uberi)"
__version__ = "3.10.0"
__license__ = "BSD"


class AudioSource(object):
def __init__(self):
Expand Down Expand Up @@ -597,7 +602,7 @@ def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, g

# import the PocketSphinx speech recognition module
try:
from pocketsphinx import pocketsphinx, Jsgf, FsgModel
from pocketsphinx import FsgModel, Jsgf, pocketsphinx

except ImportError:
raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")
Expand Down Expand Up @@ -716,9 +721,9 @@ def recognize_google(self, audio_data, key=None, language="en-US", pfilter=0, sh
actual_result = []
for line in response_text.split("\n"):
if not line: continue
result = json.loads(line)["result"]
result: list[Result] = json.loads(line)["result"]
if len(result) != 0:
actual_result = result[0]
actual_result: Result = result[0]
break

# return results
Expand All @@ -729,10 +734,10 @@ def recognize_google(self, audio_data, key=None, language="en-US", pfilter=0, sh

if "confidence" in actual_result["alternative"]:
# return alternative with highest confidence score
best_hypothesis = max(actual_result["alternative"], key=lambda alternative: alternative["confidence"])
best_hypothesis: Alternative = max(actual_result["alternative"], key=lambda alternative: alternative["confidence"])
else:
# when there is no confidence available, we arbitrarily choose the first hypothesis.
best_hypothesis = actual_result["alternative"][0]
best_hypothesis: Alternative = actual_result["alternative"][0]
if "transcript" not in best_hypothesis: raise UnknownValueError()
# https://cloud.google.com/speech-to-text/docs/basics#confidence-values
# "Your code should not require the confidence field as it is not guaranteed to be accurate, or even set, in any of the results."
Expand Down Expand Up @@ -763,8 +768,9 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en

try:
import socket
from google.cloud import speech

from google.api_core.exceptions import GoogleAPICallError
from google.cloud import speech
except ImportError:
raise RequestError('missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly.')

Expand Down Expand Up @@ -872,7 +878,9 @@ def recognize_azure(self, audio_data, key, language="en-US", profanity="masked",
access_token, expire_time = getattr(self, "azure_cached_access_token", None), getattr(self, "azure_cached_access_token_expiry", None)
allow_caching = True
try:
from time import monotonic # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+
from time import (
monotonic, # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+
)
except ImportError:
expire_time = None # monotonic time not available, don't cache access tokens
allow_caching = False # don't allow caching, since monotonic time isn't available
Expand Down Expand Up @@ -964,7 +972,9 @@ def recognize_bing(self, audio_data, key, language="en-US", show_all=False):
access_token, expire_time = getattr(self, "bing_cached_access_token", None), getattr(self, "bing_cached_access_token_expiry", None)
allow_caching = True
try:
from time import monotonic # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+
from time import (
monotonic, # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+
)
except ImportError:
expire_time = None # monotonic time not available, don't cache access tokens
allow_caching = False # don't allow caching, since monotonic time isn't available
Expand Down Expand Up @@ -1130,9 +1140,10 @@ def recognize_amazon(self, audio_data, bucket_name=None, access_key_id=None, sec
assert access_key_id is None or isinstance(access_key_id, str), "``access_key_id`` must be a string"
assert secret_access_key is None or isinstance(secret_access_key, str), "``secret_access_key`` must be a string"
assert region is None or isinstance(region, str), "``region`` must be a string"
import multiprocessing
import traceback
import uuid
import multiprocessing

from botocore.exceptions import ClientError
proc = multiprocessing.current_process()

Expand Down Expand Up @@ -1208,7 +1219,8 @@ def recognize_amazon(self, audio_data, bucket_name=None, access_key_id=None, sec

# Retrieve transcription JSON containing transcript.
transcript_uri = job['Transcript']['TranscriptFileUri']
import urllib.request, json
import json
import urllib.request
with urllib.request.urlopen(transcript_uri) as json_data:
d = json.load(json_data)
confidences = []
Expand Down Expand Up @@ -1504,7 +1516,7 @@ def recognize_whisper(self, audio_data, model="base", show_dict=False, load_opti
recognize_whisper_api = whisper.recognize_whisper_api

def recognize_vosk(self, audio_data, language='en'):
from vosk import Model, KaldiRecognizer
from vosk import KaldiRecognizer, Model

assert isinstance(audio_data, AudioData), "Data must be audio data"

Expand Down
20 changes: 20 additions & 0 deletions speech_recognition/recognizers/google.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from __future__ import annotations

from typing import TypedDict

from typing_extensions import NotRequired


class Alternative(TypedDict):
    """One transcription hypothesis from the Google Speech API."""

    # Recognized text for this hypothesis.
    transcript: str
    # Confidence score; recognize_google picks the alternative with the
    # highest value when this key is present.
    confidence: float


class Result(TypedDict):
    """A single recognition result holding its ranked alternatives."""

    # Candidate transcriptions for this utterance.
    alternative: list[Alternative]
    final: bool


class GoogleResponse(TypedDict):
    """Top-level shape of one JSON line returned by the Google Speech API."""

    result: list[Result]
    # Absent on lines that carry no results (e.g. the initial {"result":[]}).
    result_index: NotRequired[int]
116 changes: 101 additions & 15 deletions tests/test_recognition.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import http
import os
import unittest
from unittest.mock import patch, MagicMock

import speech_recognition as sr

Expand All @@ -14,25 +16,22 @@ def setUp(self):
self.AUDIO_FILE_ZH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "chinese.flac")
self.WHISPER_CONFIG = {"temperature": 0}

def test_sphinx_english(self):
def test_recognizer_attributes(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_sphinx(audio), "one two three")

def test_google_english(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertIn(r.recognize_google(audio), ["123", "1 2 3", "one two three"])
self.assertEqual(r.energy_threshold, 300)
self.assertTrue(r.dynamic_energy_threshold)
self.assertEqual(r.dynamic_energy_adjustment_damping, 0.15)
self.assertEqual(r.dynamic_energy_ratio, 1.5)
self.assertEqual(r.pause_threshold, 0.8)
self.assertIsNone(r.operation_timeout)
self.assertEqual(r.phrase_threshold, 0.3)
self.assertEqual(r.non_speaking_duration, 0.5)

def test_google_french(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source)
self.assertEqual(r.recognize_google(audio, language="fr-FR"), u"et c'est la dictée numéro 1")

def test_google_chinese(self):
def test_sphinx_english(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
self.assertEqual(r.recognize_google(audio, language="zh-CN"), u"砸自己的脚")
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_sphinx(audio), "one two three")

@unittest.skipUnless("WIT_AI_KEY" in os.environ, "requires Wit.ai key to be specified in WIT_AI_KEY environment variable")
def test_wit_english(self):
Expand Down Expand Up @@ -97,5 +96,92 @@ def test_whisper_chinese(self):
with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
self.assertEqual(r.recognize_whisper(audio, model="small", language="chinese", **self.WHISPER_CONFIG), u"砸自己的腳")


# NOTE: @patch decorators apply bottom-up, so every test method receives the
# mocks in the order (Request, urlopen).
@patch("speech_recognition.urlopen")
@patch("speech_recognition.Request")
class RecognizeGoogleTestCase(unittest.TestCase):
    """Tests for Recognizer.recognize_google with the HTTP layer mocked out."""

    def setUp(self) -> None:
        # Canned newline-delimited JSON as returned by the Google Speech API:
        # the first line is an empty result, the second holds the hypotheses.
        self.response = MagicMock(spec=http.client.HTTPResponse)
        self.response.read.return_value = b"""\
{"result":[]}
{"result":[{"alternative":[{"transcript":"one two three","confidence":0.49585345},{"transcript":"1 2","confidence":0.42899391}],"final":true}],"result_index":0}
"""
        # mock has AudioData's attributes (e.g. sample_rate)
        self.audio = MagicMock(spec=sr.audio.AudioData(None, 1, 1))

        self.r = sr.Recognizer()

    def test_return_best_hypothesis_transcript_with_default_parameters(self, Request, urlopen):
        urlopen.return_value = self.response
        self.audio.sample_rate = 16_000

        actual = self.r.recognize_google(self.audio)

        # The alternative with the highest confidence (0.49585345) wins.
        self.assertEqual(actual, "one two three")
        self.audio.get_flac_data.assert_called_once_with(convert_rate=None, convert_width=2)
        Request.assert_called_once_with(
            "http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw&pFilter=0",
            data=self.audio.get_flac_data.return_value,
            headers={"Content-Type": "audio/x-flac; rate=16000"},
        )
        urlopen.assert_called_once_with(Request.return_value, timeout=None)
        self.response.read.assert_called_once_with()

    def test_minimum_sample_rate(self, Request, urlopen):
        # Sample rates below 8000 Hz must be up-converted to 8000 Hz.
        urlopen.return_value = self.response
        self.audio.sample_rate = 7_999

        _ = self.r.recognize_google(self.audio)

        self.audio.get_flac_data.assert_called_once_with(convert_rate=8000, convert_width=2)

    def test_specified_language_request(self, Request, urlopen):
        # The language keyword must be forwarded as the lang= query parameter.
        urlopen.return_value = self.response
        self.audio.sample_rate = 16_000

        _ = self.r.recognize_google(self.audio, language="zh-CN")

        Request.assert_called_once_with(
            "http://www.google.com/speech-api/v2/recognize?client=chromium&lang=zh-CN&key=AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw&pFilter=0",
            data=self.audio.get_flac_data.return_value,
            headers={"Content-Type": "audio/x-flac; rate=16000"},
        )

    def test_specified_key_request(self, Request, urlopen):
        # A caller-supplied API key must replace the built-in default key.
        urlopen.return_value = self.response
        self.audio.sample_rate = 16_000

        _ = self.r.recognize_google(self.audio, key="awesome-key")

        Request.assert_called_once_with(
            "http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=awesome-key&pFilter=0",
            data=self.audio.get_flac_data.return_value,
            headers={"Content-Type": "audio/x-flac; rate=16000"},
        )

    def test_show_all(self, Request, urlopen):
        # show_all=True returns the parsed result dict instead of a transcript.
        urlopen.return_value = self.response
        self.audio.sample_rate = 16_000

        actual = self.r.recognize_google(self.audio, show_all=True)

        expected = {
            "alternative": [
                {"transcript": "one two three", "confidence": 0.49585345},
                {"transcript": "1 2", "confidence": 0.42899391}
            ],
            "final": True
        }
        self.assertEqual(actual, expected)

    def test_with_confidence(self, Request, urlopen):
        # with_confidence=True returns a (transcript, confidence) tuple.
        urlopen.return_value = self.response
        self.audio.sample_rate = 16_000

        actual = self.r.recognize_google(self.audio, with_confidence=True)

        self.assertEqual(actual, ("one two three", 0.49585345))


# Allow running this test module directly (python tests/test_recognition.py).
if __name__ == "__main__":
    unittest.main()

0 comments on commit e681993

Please sign in to comment.