diff --git a/README.rst b/README.rst
index da70104d..7bb0c0cf 100644
--- a/README.rst
+++ b/README.rst
@@ -160,8 +160,9 @@ You can install it with :command:`python3 -m pip install SpeechRecognition[googl
 
 **Prerequisite**: Create local authentication credentials for your Google account
 
-* `Before you begin (Transcribe speech to text by using client libraries) `__
-* Detail: `User credentials (Set up ADC for a local development environment) `__
+* Digest: `Before you begin (Transcribe speech to text by using client libraries) `__
+* `Set up Speech-to-Text `__
+* `User credentials (Set up ADC for a local development environment) `__
 
 Currently only `V1 `__ is supported. (`V2 `__ is not supported)
diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py
index 7806023f..ec14614d 100644
--- a/examples/audio_transcribe.py
+++ b/examples/audio_transcribe.py
@@ -33,9 +33,9 @@
     print("Could not request results from Google Speech Recognition service; {0}".format(e))
 
 # recognize speech using Google Cloud Speech
-GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE"""
+# Before running, create local authentication credentials (``gcloud auth application-default login``)
 try:
-    print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS))
+    print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio))
 except sr.UnknownValueError:
     print("Google Cloud Speech could not understand audio")
 except sr.RequestError as e:
diff --git a/examples/extended_results.py b/examples/extended_results.py
index 599c67f2..f65061ed 100644
--- a/examples/extended_results.py
+++ b/examples/extended_results.py
@@ -37,10 +37,10 @@
     print("Could not request results from Google Speech Recognition service; {0}".format(e))
 
 # recognize speech using Google Cloud Speech
-GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE"""
+# Before running, create local authentication credentials (``gcloud auth application-default login``)
 try:
     print("Google Cloud Speech recognition results:")
-    pprint(r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS, show_all=True))  # pretty-print the recognition result
+    pprint(r.recognize_google_cloud(audio, show_all=True))  # pretty-print the recognition result
 except sr.UnknownValueError:
     print("Google Cloud Speech could not understand audio")
 except sr.RequestError as e:
diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py
index a4f10a9b..e864e2a4 100644
--- a/examples/microphone_recognition.py
+++ b/examples/microphone_recognition.py
@@ -32,9 +32,9 @@
     print("Could not request results from Google Speech Recognition service; {0}".format(e))
 
 # recognize speech using Google Cloud Speech
-GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE"""
+# Before running, create local authentication credentials (``gcloud auth application-default login``)
 try:
-    print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS))
+    print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio))
 except sr.UnknownValueError:
     print("Google Cloud Speech could not understand audio")
 except sr.RequestError as e:
diff --git a/examples/special_recognizer_features.py b/examples/special_recognizer_features.py
index f4365297..1d051ede 100644
--- a/examples/special_recognizer_features.py
+++ b/examples/special_recognizer_features.py
@@ -35,11 +35,11 @@
 # recognize preferred phrases using Google Cloud Speech
-GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE"""
+# Before running, create local authentication credentials (``gcloud auth application-default login``)
 try:
     print("Google Cloud Speech recognition for \"numero\" with different sets of preferred phrases:")
-    print(r.recognize_google_cloud(audio_fr, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS, preferred_phrases=["noomarow"]))
-    print(r.recognize_google_cloud(audio_fr, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS, preferred_phrases=["newmarrow"]))
+    print(r.recognize_google_cloud(audio_fr, preferred_phrases=["noomarow"]))
+    print(r.recognize_google_cloud(audio_fr, preferred_phrases=["newmarrow"]))
 except sr.UnknownValueError:
     print("Google Cloud Speech could not understand audio")
 except sr.RequestError as e:
diff --git a/reference/library-reference.rst b/reference/library-reference.rst
index 82239fd2..61370837 100644
--- a/reference/library-reference.rst
+++ b/reference/library-reference.rst
@@ -227,8 +227,8 @@ Returns the most likely transcription if ``show_all`` is false (the default). Ot
 
 Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
 
-``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json_path: Union[str, None] = None, language: str = "en-US", preferred_phrases: Union[Iterable[str], None] = None, show_all: bool = False, **api_params) -> Union[str, Dict[str, Any]]``
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json_path: Union[str, None] = None, **kwargs) -> Union[str, Dict[str, Any]]``
+-------------------------------------------------------------------------------------------------------------------------------------------------------------
 
 .. autofunction:: speech_recognition.recognizers.google_cloud.recognize
diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py
index 23c1b4e5..5c5a7f62 100644
--- a/speech_recognition/recognizers/google_cloud.py
+++ b/speech_recognition/recognizers/google_cloud.py
@@ -1,52 +1,99 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING, TypedDict
 from urllib.error import URLError
 
 from speech_recognition.audio import AudioData
 from speech_recognition.exceptions import RequestError, UnknownValueError
 
+if TYPE_CHECKING:
+    from google.cloud.speech import (
+        RecognitionConfig,
+        RecognizeResponse,
+        SpeechContext,
+    )
+    from typing_extensions import Required
+
 
-def recognize(
-    recognizer,
-    audio_data: AudioData,
-    credentials_json_path: str | None = None,
-    language: str = "en-US",
-    preferred_phrases=None,
-    show_all: bool = False,
-    **api_params,
-):
-    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API.
-    This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart `__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file `__.
+class GoogleCloudRecognizerParameters(TypedDict, total=False):
+    """Optional parameters.
+
+    The recognition language is determined by ``language_code``, which is a BCP-47 language tag like ``"en-US"`` (US English). Default: ``"en-US"``.
+    A list of supported language tags can be found in the `Speech-to-Text supported languages `__.
+
+    If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives.
+    This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary.
+    Note that the API imposes certain `restrictions on the list of phrase strings `__.
+
+    ``show_all``: See :py:func:`recognize`.
+
+    ``model``: You can select the model best suited to your domain to get the best results. (See `RecognitionConfig's documentation `__ for details.)
 
-    The recognition language is determined by ``language``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation `__.
+    ``use_enhanced``: Set to true to use an enhanced model for speech recognition.
+    """
+
+    # SpeechRecognition specific parameters
+    preferred_phrases: list[str]
+    show_all: bool
+
+    # Speech-to-Text V1 API's parameters
+    language_code: str
+    model: str
+    use_enhanced: bool
+    # TODO Add support for other parameters
+
+
+class GoogleCloudSpeechV1Parameters(TypedDict, total=False):
+    """Speech-to-Text V1 API's parameters.
 
-    If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings `__.
+    https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig
+    """
 
-    ``api_params`` are Cloud Speech API-specific parameters as dict (optional). For more information see
+    encoding: Required[RecognitionConfig.AudioEncoding]
+    sample_rate_hertz: Required[int]
+    language_code: Required[str]
+    speech_contexts: list[SpeechContext]
+    enable_word_time_offsets: bool
+    model: str
+    use_enhanced: bool
 
-    The ``use_enhanced`` is a boolean option. If use_enhanced is set to true and the model field is not set,
-    then an appropriate enhanced model is chosen if an enhanced model exists for the audio.
-    If use_enhanced is true and an enhanced version of the specified model does not exist,
-    then the speech is recognized using the standard version of the specified model.
-    Furthermore, if the option ``use_enhanced`` has not been set the option ``model`` can be used, which can be used to select the model best
-    suited to your domain to get best results. If a model is not explicitly specified,
-    then we auto-select a model based on the other parameters of this method.
+def _build_config(
+    audio_data: AudioData, recognizer_params: GoogleCloudRecognizerParameters
+) -> RecognitionConfig:
+    from google.cloud import speech
+
+    parameters: GoogleCloudSpeechV1Parameters = {
+        "encoding": speech.RecognitionConfig.AudioEncoding.FLAC,
+        "sample_rate_hertz": audio_data.sample_rate,
+        "language_code": recognizer_params.pop("language_code", "en-US"),
+    }
+    if preferred_phrases := recognizer_params.pop("preferred_phrases", None):
+        parameters["speech_contexts"] = [
+            speech.SpeechContext(phrases=preferred_phrases)
+        ]
+    if recognizer_params.pop("show_all", False):
+        # ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets
+        parameters["enable_word_time_offsets"] = True
+    return speech.RecognitionConfig(**(parameters | recognizer_params))
+
+
+def recognize(
+    recognizer,
+    audio_data: AudioData,
+    credentials_json_path: str | None = None,
+    **kwargs: GoogleCloudRecognizerParameters,
+) -> str | RecognizeResponse:
+    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API.
+
+    This function requires a Google Cloud Platform account; see `Set up Speech-to-Text `__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project,
+    and create local authentication credentials for your user account. The result is a JSON file containing the API credentials. You can specify the JSON file path with ``credentials_json_path``. If not specified, the library will try to automatically `find the default API credentials JSON file `__.
 
     Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary.
+    For other parameters, see :py:class:`GoogleCloudRecognizerParameters`.
 
     Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection.
""" - assert isinstance( - audio_data, AudioData - ), "``audio_data`` must be audio data" - assert isinstance(language, str), "``language`` must be a string" - assert preferred_phrases is None or all( - isinstance(preferred_phrases, (type(""), type(""))) - for preferred_phrases in preferred_phrases - ), "``preferred_phrases`` must be a list of strings" - try: from google.api_core.exceptions import GoogleAPICallError from google.cloud import speech @@ -72,21 +119,7 @@ def recognize( ) audio = speech.RecognitionAudio(content=flac_data) - config = { - "encoding": speech.RecognitionConfig.AudioEncoding.FLAC, - "sample_rate_hertz": audio_data.sample_rate, - "language_code": language, - **api_params, - } - if preferred_phrases is not None: - config["speechContexts"] = [ - speech.SpeechContext(phrases=preferred_phrases) - ] - if show_all: - # ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets - config["enable_word_time_offsets"] = True - - config = speech.RecognitionConfig(**config) + config = _build_config(audio_data, kwargs.copy()) try: response = client.recognize(config=config, audio=audio) @@ -97,7 +130,7 @@ def recognize( "recognition connection failed: {0}".format(e.reason) ) - if show_all: + if kwargs.get("show_all"): return response if len(response.results) == 0: raise UnknownValueError() diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py index c450d14f..cca80aec 100644 --- a/tests/recognizers/test_google_cloud.py +++ b/tests/recognizers/test_google_cloud.py @@ -4,6 +4,7 @@ RecognitionAudio, RecognitionConfig, RecognizeResponse, + SpeechContext, SpeechRecognitionAlternative, SpeechRecognitionResult, WordInfo, @@ -164,7 +165,8 @@ def test_transcribe_with_specified_api_parameters(SpeechClient): _ = recognize( MagicMock(spec=Recognizer), audio_data, - language="ja-JP", + language_code="ja-JP", + preferred_phrases=["numero", "hoge"], use_enhanced=True, ) @@ -173,6 +175,7 @@ def test_transcribe_with_specified_api_parameters(SpeechClient): encoding=RecognitionConfig.AudioEncoding.FLAC, sample_rate_hertz=16_000, language_code="ja-JP", + speech_contexts=[SpeechContext(phrases=["numero", "hoge"])], use_enhanced=True, ), audio=RecognitionAudio(content=b"flac_data"),