From bc31b966bcdef3f3035884e3661dba0dc91bbac0 Mon Sep 17 00:00:00 2001 From: Thomas Schultz Date: Mon, 3 Oct 2016 14:52:49 -0400 Subject: [PATCH 1/6] Add Speech Streaming API. --- docs/index.rst | 1 + docs/speech-streaming.rst | 33 ++++ docs/speech-usage.rst | 77 ++++++++ scripts/verify_included_modules.py | 1 + speech/google/cloud/speech/client.py | 125 +++++++++++- speech/google/cloud/speech/operation.py | 3 +- speech/google/cloud/speech/sample.py | 32 ++- .../google/cloud/speech/streaming/__init__.py | 0 .../cloud/speech/streaming/container.py | 72 +++++++ .../google/cloud/speech/streaming/request.py | 183 ++++++++++++++++++ .../google/cloud/speech/streaming/response.py | 77 ++++++++ .../google/cloud/speech/streaming/result.py | 73 +++++++ speech/google/cloud/speech/transcript.py | 28 ++- speech/setup.py | 1 + speech/unit_tests/streaming/__init__.py | 13 ++ speech/unit_tests/streaming/test_container.py | 124 ++++++++++++ speech/unit_tests/streaming/test_request.py | 50 +++++ speech/unit_tests/streaming/test_response.py | 29 +++ speech/unit_tests/test_client.py | 78 ++++++++ speech/unit_tests/test_transcript.py | 3 +- 20 files changed, 987 insertions(+), 16 deletions(-) create mode 100644 docs/speech-streaming.rst create mode 100644 speech/google/cloud/speech/streaming/__init__.py create mode 100644 speech/google/cloud/speech/streaming/container.py create mode 100644 speech/google/cloud/speech/streaming/request.py create mode 100644 speech/google/cloud/speech/streaming/response.py create mode 100644 speech/google/cloud/speech/streaming/result.py create mode 100644 speech/unit_tests/streaming/__init__.py create mode 100644 speech/unit_tests/streaming/test_container.py create mode 100644 speech/unit_tests/streaming/test_request.py create mode 100644 speech/unit_tests/streaming/test_response.py diff --git a/docs/index.rst b/docs/index.rst index 7a888a1c9cb9..902c0ad80eee 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -177,6 +177,7 @@ speech-encoding 
speech-metadata speech-operation + speech-streaming speech-sample speech-transcript diff --git a/docs/speech-streaming.rst b/docs/speech-streaming.rst new file mode 100644 index 000000000000..6fa42dbd9f79 --- /dev/null +++ b/docs/speech-streaming.rst @@ -0,0 +1,33 @@ +Speech StreamingResponseContainer +================================= + +.. automodule:: google.cloud.speech.streaming.container + :members: + :undoc-members: + :show-inheritance: + +Speech Streaming Request helpers +================================ + +.. automodule:: google.cloud.speech.streaming.request + :members: + :undoc-members: + :show-inheritance: + +Speech StreamingSpeechResponse +============================== + +.. automodule:: google.cloud.speech.streaming.response + :members: + :undoc-members: + :show-inheritance: + + + +Speech StreamingSpeechResult +============================ + +.. automodule:: google.cloud.speech.streaming.result + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/speech-usage.rst b/docs/speech-usage.rst index e3341051d128..6161f1925d09 100644 --- a/docs/speech-usage.rst +++ b/docs/speech-usage.rst @@ -145,5 +145,82 @@ words to the vocabulary of the recognizer. transcript: Hello, this is a test confidence: 0.81 + +Streaming Recognition +--------------------- + +The :meth:`~google.cloud.speech.Client.stream_recognize` method converts speech +data to possible text alternatives on the fly. + +.. note:: + Streaming recognition requests are limited to 1 minute of audio. + + See: https://cloud.google.com/speech/limits#content + + >>> import io + >>> from google.cloud import speech + >>> from google.cloud.speech.encoding import Encoding + >>> client = speech.Client() + >>> with io.open('./hello.wav', 'rb') as stream: + >>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16, + ... 
sample_rate=16000) + >>> stream_container = client.stream_recognize(sample) + >>> print(stream_container) + + >>> print(stream_container.responses) + {0: } + >>> print(stream_container.responses[0].results[0].alternatives[0].confidence) + 0.698092460632 + >>> print(stream_container.is_finished) + True + >>> print stream_container.get_full_text() + hello + +By default the recognizer will perform continuous recognition +(continuing to process audio even if the user pauses speaking) until the client +closes the output stream or when the maximum time limit has been reached. + +If you only want to recognize a single utterance you can set + ``single_utterance`` to ``True`` and only one result will be returned. + +See: `Single Utterance`_ + +.. code-block:: python + + >>> with io.open('./hello_pause_goodbye.wav', 'rb') as stream: + >>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16, + ... sample_rate=16000) + >>> stream_container = client.stream_recognize(sample, + ... single_utterance=True) + >>> print(stream_container.get_full_text()) + hello + + +If ``interim_results`` is set to ``True``, interim results +(tentative hypotheses) may be returned as they become available. + + .. code-block:: python + + >>> with io.open('./hello_pause_goodbye.wav', 'rb') as stream: + >>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16, + ... sample_rate=16000) + >>> stream_container = client.stream_recognize(sample, + ... interim_results=True) + >>> print(stream_container.get_full_text()) + hello + + >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', + ... encoding=Encoding.FLAC, + ... sample_rate=44100) + >>> results = client.stream_recognize(sample, interim_results=True) + >>> print(stream_container.responses[0].results[0].alternatives[0].transcript) + how + print(stream_container.responses[1].results[0].alternatives[0].transcript) + hello + >>> print(stream_container.responses[1].results[2].is_final) + True + + +.. 
_Single Utterance: https://cloud.google.com/speech/reference/rpc/google.cloud.speech.v1beta1#streamingrecognitionconfig .. _sync_recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/syncrecognize .. _Speech Asynchronous Recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/asyncrecognize diff --git a/scripts/verify_included_modules.py b/scripts/verify_included_modules.py index d0791f0807b0..9e7f2963b99d 100644 --- a/scripts/verify_included_modules.py +++ b/scripts/verify_included_modules.py @@ -44,6 +44,7 @@ 'google.cloud.pubsub.__init__', 'google.cloud.resource_manager.__init__', 'google.cloud.speech.__init__', + 'google.cloud.speech.streaming.__init__', 'google.cloud.storage.__init__', 'google.cloud.streaming.__init__', 'google.cloud.streaming.buffered_stream', diff --git a/speech/google/cloud/speech/client.py b/speech/google/cloud/speech/client.py index 553927d237cd..c9598d16cee4 100644 --- a/speech/google/cloud/speech/client.py +++ b/speech/google/cloud/speech/client.py @@ -14,15 +14,30 @@ """Basic client for Google Cloud Speech API.""" +import os from base64 import b64encode from google.cloud._helpers import _to_bytes from google.cloud._helpers import _bytes_to_unicode from google.cloud import client as client_module +from google.cloud.environment_vars import DISABLE_GRPC from google.cloud.speech.connection import Connection from google.cloud.speech.encoding import Encoding from google.cloud.speech.operation import Operation +from google.cloud.speech.streaming.request import _make_request_stream from google.cloud.speech.sample import Sample +from google.cloud.speech.streaming.container import StreamingResponseContainer + +try: + from google.cloud.gapic.speech.v1beta1.speech_api import SpeechApi +except ImportError: # pragma: NO COVER + _HAVE_GAX = False +else: + _HAVE_GAX = True + + +_DISABLE_GAX = os.getenv(DISABLE_GRPC, False) +_USE_GAX = _HAVE_GAX and not _DISABLE_GAX class Client(client_module.Client): @@ -47,6 +62,7 
@@ class Client(client_module.Client): """ _connection_class = Connection + _speech_api = None def async_recognize(self, sample, language_code=None, max_alternatives=None, profanity_filter=None, @@ -104,7 +120,7 @@ def async_recognize(self, sample, language_code=None, return Operation.from_api_repr(self, api_response) @staticmethod - def sample(content=None, source_uri=None, encoding=None, + def sample(content=None, source_uri=None, stream=None, encoding=None, sample_rate=None): """Factory: construct Sample to use when making recognize requests. @@ -118,6 +134,9 @@ def sample(content=None, source_uri=None, encoding=None, supported, which must be specified in the following format: ``gs://bucket_name/object_name``. + :type stream: :class:`io.BufferedReader` + :param stream: File like object to read audio data from. + :type encoding: str :param encoding: encoding of audio data sent in all RecognitionAudio messages, can be one of: :attr:`~.Encoding.LINEAR16`, @@ -135,7 +154,7 @@ def sample(content=None, source_uri=None, encoding=None, :rtype: :class:`~google.cloud.speech.sample.Sample` :returns: Instance of ``Sample``. """ - return Sample(content=content, source_uri=source_uri, + return Sample(content=content, source_uri=source_uri, stream=stream, encoding=encoding, sample_rate=sample_rate) def sync_recognize(self, sample, language_code=None, @@ -199,6 +218,108 @@ def sync_recognize(self, sample, language_code=None, else: raise ValueError('result in api should have length 1') + def stream_recognize(self, sample, language_code=None, + max_alternatives=None, profanity_filter=None, + speech_context=None, single_utterance=False, + interim_results=False): + """Streaming speech recognition. + + .. note:: + Streaming recognition requests are limited to 1 minute of audio. + + See: https://cloud.google.com/speech/limits#content + + :type sample: :class:`~google.cloud.speech.sample.Sample` + :param sample: Instance of ``Sample`` containing audio information. 
+ + :type language_code: str + :param language_code: (Optional) The language of the supplied audio as + BCP-47 language tag. Example: ``'en-GB'``. + If omitted, defaults to ``'en-US'``. + + :type max_alternatives: int + :param max_alternatives: (Optional) Maximum number of recognition + hypotheses to be returned. The server may + return fewer than maxAlternatives. + Valid values are 0-30. A value of 0 or 1 + will return a maximum of 1. Defaults to 1 + + :type profanity_filter: bool + :param profanity_filter: If True, the server will attempt to filter + out profanities, replacing all but the + initial character in each filtered word with + asterisks, e.g. ``'f***'``. If False or + omitted, profanities won't be filtered out. + + :type speech_context: list + :param speech_context: A list of strings (max 50) containing words and + phrases "hints" so that the speech recognition + is more likely to recognize them. This can be + used to improve the accuracy for specific words + and phrases. This can also be used to add new + words to the vocabulary of the recognizer. + + :type single_utterance: boolean + :param single_utterance: [Optional] If false or omitted, the recognizer + will perform continuous recognition + (continuing to process audio even if the user + pauses speaking) until the client closes the + output stream (gRPC API) or when the maximum + time limit has been reached. Multiple + SpeechRecognitionResults with the is_final + flag set to true may be returned. + + If true, the recognizer will detect a single + spoken utterance. When it detects that the + user has paused or stopped speaking, it will + return an END_OF_UTTERANCE event and cease + recognition. It will return no more than one + SpeechRecognitionResult with the is_final flag + set to true. 
+ + :type interim_results: boolean + :param interim_results: [Optional] If true, interim results (tentative + hypotheses) may be returned as they become + available (these interim results are indicated + with the is_final=false flag). If false or + omitted, only is_final=true result(s) are + returned. + + :rtype: :class:`~streaming.StreamingResponseContainer` + :returns: An instance of ``StreamingReponseContainer``. + + """ + if not _USE_GAX: + raise EnvironmentError('GRPC is required to use this API.') + + if sample.stream.closed: + raise ValueError('Stream is closed.') + + requests = _make_request_stream(sample, language_code=language_code, + max_alternatives=max_alternatives, + profanity_filter=profanity_filter, + speech_context=speech_context, + single_utterance=single_utterance, + interim_results=interim_results) + + responses = StreamingResponseContainer() + for response in self.speech_api.streaming_recognize(requests): + if response: + responses.add_response(response) + + return responses + + @property + def speech_api(self): + """Instance of Speech API. + + :rtype: :class:`google.cloud.gapic.speech.v1beta1.speech_api.SpeechApi` + :returns: Instance of ``SpeechApi``. 
+ """ + if not self._speech_api: + self._speech_api = SpeechApi() + return self._speech_api + def _build_request_data(sample, language_code=None, max_alternatives=None, profanity_filter=None, speech_context=None): diff --git a/speech/google/cloud/speech/operation.py b/speech/google/cloud/speech/operation.py index 69614b16cb7f..e7abbf88636d 100644 --- a/speech/google/cloud/speech/operation.py +++ b/speech/google/cloud/speech/operation.py @@ -124,7 +124,8 @@ def _update(self, response): results = [] if raw_results: for result in raw_results[0]['alternatives']: - results.append(Transcript(result)) + results.append(Transcript(result.get('transcript'), + result.get('confidence'))) if metadata: self._metadata = Metadata.from_api_repr(metadata) diff --git a/speech/google/cloud/speech/sample.py b/speech/google/cloud/speech/sample.py index a197f20372f6..5b1608f80e79 100644 --- a/speech/google/cloud/speech/sample.py +++ b/speech/google/cloud/speech/sample.py @@ -30,6 +30,9 @@ class Sample(object): supported, which must be specified in the following format: ``gs://bucket_name/object_name``. + :type stream: :class:`io.BufferedReader` + :param stream: File like object to read audio data from. 
+ :type encoding: str :param encoding: encoding of audio data sent in all RecognitionAudio messages, can be one of: :attr:`~.Encoding.LINEAR16`, @@ -47,16 +50,15 @@ class Sample(object): default_encoding = Encoding.FLAC default_sample_rate = 16000 - def __init__(self, content=None, source_uri=None, + def __init__(self, content=None, source_uri=None, stream=None, encoding=None, sample_rate=None): - - no_source = content is None and source_uri is None - both_source = content is not None and source_uri is not None - if no_source or both_source: - raise ValueError('Supply one of \'content\' or \'source_uri\'') + if [content, source_uri, stream].count(None) != 2: + raise ValueError('Supply only one of \'content\', \'source_uri\'' + ' or stream.') self._content = content self._source_uri = source_uri + self._stream = stream if sample_rate is not None and not 8000 <= sample_rate <= 48000: raise ValueError('The value of sample_rate must be between 8000' @@ -68,6 +70,15 @@ def __init__(self, content=None, source_uri=None, else: raise ValueError('Invalid encoding: %s' % (encoding,)) + @property + def chunk_size(self): + """Chunk size to send over GRPC. ~100ms + + :rtype: int + :returns: Optimized chunk size. + """ + return int(self.sample_rate / 10) + @property def source_uri(self): """Google Cloud Storage URI of audio source. @@ -77,6 +88,15 @@ def source_uri(self): """ return self._source_uri + @property + def stream(self): + """Stream of audio data. + + :rtype: :class:`io.BufferedReader` + :returns: File like object to read audio data from. + """ + return self._stream + @property def content(self): """Bytes of audio content. 
diff --git a/speech/google/cloud/speech/streaming/__init__.py b/speech/google/cloud/speech/streaming/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/speech/google/cloud/speech/streaming/container.py b/speech/google/cloud/speech/streaming/container.py new file mode 100644 index 000000000000..52384b9597a2 --- /dev/null +++ b/speech/google/cloud/speech/streaming/container.py @@ -0,0 +1,72 @@ +# Copyright 2016 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Representation of a group of GAPIC Speech API responses.""" + +from google.cloud.speech.streaming.response import StreamingSpeechResponse + + +class StreamingResponseContainer(object): + """Response container to help manage streaming responses. + + :type responses: list of :class:`~response.StreamingSpeechResponse` + :param responses: List of ``StreamingSpeechResponse`` objects. + """ + def __init__(self, responses=None): + self._responses = responses or {} + + def add_response(self, response): + """Add/update response based on the ``result_index``. + + :type response: :class:`~response.StreamingSpeechResponse` + :param response: Instance of ``StreamingSpeechResponse``. + """ + self._responses.update({response.result_index: + StreamingSpeechResponse.from_pb(response)}) + + @property + def responses(self): + """All responses held in container. + + :rtype: list of :class:`~response.StreamingSpeechResponse` + :returns: List of ``StreamingSpeechResponse`` objects. 
+ """ + return self._responses + + @property + def is_finished(self): + """Helper property to determin if all resuls are final. + + :rtype: bool + :returns: True of all captured results are final. + """ + finished = [] + for response in self.responses.values(): + for result in response.results: + finished.append(result.is_final) + return all(finished) + + def get_full_text(self): + """Parse together all transcript results to form complete text. + + :rtype: str + :returns: Complete transcription. + """ + text = None + if self.is_finished: + text = '' + for response in self.responses.values(): + for result in response.results: + text += result.alternatives[0].transcript + return text diff --git a/speech/google/cloud/speech/streaming/request.py b/speech/google/cloud/speech/streaming/request.py new file mode 100644 index 000000000000..794e3f992a67 --- /dev/null +++ b/speech/google/cloud/speech/streaming/request.py @@ -0,0 +1,183 @@ +# Copyright 2016 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Helper to make Speech requests from IO stream""" + +from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import RecognitionConfig +from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + StreamingRecognitionConfig) +from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + StreamingRecognizeRequest) + + +def _make_request_stream(sample, language_code=None, max_alternatives=None, + profanity_filter=None, speech_context=None, + single_utterance=None, interim_results=None): + """Generate stream of requests from sample. + + :type sample: :class:`~google.cloud.speech.sample.Sample` + :param sample: Instance of ``Sample`` containing audio information. + + :type language_code: str + :param language_code: (Optional) The language of the supplied audio as + BCP-47 language tag. Example: ``'en-GB'``. + If omitted, defaults to ``'en-US'``. + + :type max_alternatives: int + :param max_alternatives: (Optional) Maximum number of recognition + hypotheses to be returned. The server may + return fewer than maxAlternatives. + Valid values are 0-30. A value of 0 or 1 + will return a maximum of 1. Defaults to 1 + + :type profanity_filter: bool + :param profanity_filter: If True, the server will attempt to filter + out profanities, replacing all but the + initial character in each filtered word with + asterisks, e.g. ``'f***'``. If False or + omitted, profanities won't be filtered out. + + :type speech_context: list + :param speech_context: A list of strings (max 50) containing words and + phrases "hints" so that the speech recognition + is more likely to recognize them. This can be + used to improve the accuracy for specific words + and phrases. This can also be used to add new + words to the vocabulary of the recognizer. 
+ + :type single_utterance: boolean + :param single_utterance: [Optional] If false or omitted, the recognizer + will perform continuous recognition + (continuing to process audio even if the user + pauses speaking) until the client closes the + output stream (gRPC API) or when the maximum + time limit has been reached. Multiple + SpeechRecognitionResults with the is_final + flag set to true may be returned. + + If true, the recognizer will detect a single + spoken utterance. When it detects that the + user has paused or stopped speaking, it will + return an END_OF_UTTERANCE event and cease + recognition. It will return no more than one + SpeechRecognitionResult with the is_final flag + set to true. + + :type interim_results: boolean + :param interim_results: [Optional] If true, interim results (tentative + hypotheses) may be returned as they become + available (these interim results are indicated + with the is_final=false flag). If false or + omitted, only is_final=true result(s) are + returned. + """ + config_request = _make_streaming_config( + sample, language_code=language_code, max_alternatives=max_alternatives, + profanity_filter=profanity_filter, speech_context=speech_context, + single_utterance=single_utterance, interim_results=interim_results) + + # The config request MUST go first and not contain any audio data. + yield config_request + + buff = b'' + for data in sample.stream: + # Optimize the request data size to around 100ms. + if len(buff) + len(data) >= sample.chunk_size: + yield StreamingRecognizeRequest(audio_content=buff) + buff = data + else: + buff += data + + # Clear final contents of buffer. + yield StreamingRecognizeRequest(audio_content=buff) + + +def _make_streaming_config(sample, language_code, + max_alternatives, profanity_filter, + speech_context, single_utterance, + interim_results): + """Build streaming configuration. 
+ + :type sample: :class:`~google.cloud.speech.sample.Sample` + :param sample: Instance of ``Sample`` containing audio information. + + :type language_code: str + :param language_code: (Optional) The language of the supplied audio as + BCP-47 language tag. Example: ``'en-GB'``. + If omitted, defaults to ``'en-US'``. + + :type max_alternatives: int + :param max_alternatives: (Optional) Maximum number of recognition + hypotheses to be returned. The server may + return fewer than maxAlternatives. + Valid values are 0-30. A value of 0 or 1 + will return a maximum of 1. Defaults to 1 + + :type profanity_filter: bool + :param profanity_filter: If True, the server will attempt to filter + out profanities, replacing all but the + initial character in each filtered word with + asterisks, e.g. ``'f***'``. If False or + omitted, profanities won't be filtered out. + + :type speech_context: list + :param speech_context: A list of strings (max 50) containing words and + phrases "hints" so that the speech recognition + is more likely to recognize them. This can be + used to improve the accuracy for specific words + and phrases. This can also be used to add new + words to the vocabulary of the recognizer. + + :type single_utterance: boolean + :param single_utterance: [Optional] If false or omitted, the recognizer + will perform continuous recognition + (continuing to process audio even if the user + pauses speaking) until the client closes the + output stream (gRPC API) or when the maximum + time limit has been reached. Multiple + SpeechRecognitionResults with the is_final + flag set to true may be returned. + + If true, the recognizer will detect a single + spoken utterance. When it detects that the + user has paused or stopped speaking, it will + return an END_OF_UTTERANCE event and cease + recognition. It will return no more than one + SpeechRecognitionResult with the is_final flag + set to true. 
+ + :type interim_results: boolean + :param interim_results: [Optional] If true, interim results (tentative + hypotheses) may be returned as they become + available (these interim results are indicated + with the is_final=false flag). If false or + omitted, only is_final=true result(s) are + returned. + + :rtype: :class:`~StreamingRecognitionConfig` + :returns: Instance of ``StreamingRecognitionConfig``. + """ + config = RecognitionConfig( + encoding=sample.encoding, sample_rate=sample.sample_rate, + language_code=language_code, max_alternatives=max_alternatives, + profanity_filter=profanity_filter, speech_context=speech_context) + + streaming_config = StreamingRecognitionConfig( + config=config, single_utterance=single_utterance, + interim_results=interim_results) + + config_request = StreamingRecognizeRequest( + streaming_config=streaming_config) + + return config_request diff --git a/speech/google/cloud/speech/streaming/response.py b/speech/google/cloud/speech/streaming/response.py new file mode 100644 index 000000000000..ec9428985913 --- /dev/null +++ b/speech/google/cloud/speech/streaming/response.py @@ -0,0 +1,77 @@ +# Copyright 2016 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Representation of a GAPIC Speech API response.""" + +from google.cloud.speech.streaming.result import StreamingSpeechResult + + +class StreamingSpeechResponse(object): + """Representation of a Speech API protobuf streaming response. 
+ + :type error: :class:`google.grpc.Status` + :param error: Instance of ``Status`` + + :type endpointer_type: :class:`~EndpointerType` + :param endpointer_type: Enum of endpointer event. + + :type results: list of + :class:`google.cloud.speech.v1beta1.StreamingRecognitionResult` + :param results: List of protobuf ``StreamingRecognitionResult``. + + :type result_index: int + :param result_index: Index for specific result set. Used for updating with + ``interim_results``. + """ + def __init__(self, error, endpointer_type, results, result_index): + self._error = error + self._endpointer_type = endpointer_type # Should be enum. + self._result_index = result_index + self._results = [StreamingSpeechResult.from_pb(result) + for result in results] + + @classmethod + def from_pb(cls, pb_response): + """Factory: construct a ``StreamingSpeechResponse`` from protobuf. + + :type pb_response: + :class:`google.cloud.speech.v1beta1.StreamingRecognizeResponse` + :param pb_response: Instance of protobuf + ``StreamingRecognizeResponse``. + :rtype: :class:`~StreamingSpeechResponse` + :returns: Instance of ``StreamingSpeechResponse``. + """ + error = pb_response.error + endpointer_type = pb_response.endpointer_type + results = pb_response.results + result_index = pb_response.result_index + return cls(error, endpointer_type, results, result_index) + + @property + def result_index(self): + """Result index associated with this response. + + :rtype: int + :returns: Result index of this response. + """ + return self._result_index + + @property + def results(self): + """List of results for this response. + + :rtype: list of :class:`~result.StreamingSpeechResult` + :returns: List of ``StreamingSpeechResult`` in this response. 
+ """ + return self._results diff --git a/speech/google/cloud/speech/streaming/result.py b/speech/google/cloud/speech/streaming/result.py new file mode 100644 index 000000000000..104916eda9e0 --- /dev/null +++ b/speech/google/cloud/speech/streaming/result.py @@ -0,0 +1,73 @@ +# Copyright 2016 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Representation of Speech GAPIC API result.""" + +from google.cloud.speech.transcript import Transcript + + +class StreamingSpeechResult(object): + """Factory: contruct streaming speech result. + + :type alternatives: + :class:`google.cloud.speech.v1beta1.SpeechRecognitionAlternative` + :param alternatives: List of ``SpeechRecognitionAlternative``. + + :type is_final: bool + :param is_final: Indicates if the transcription is complete. + + :type stability: float + :param stability: An estimate of the probability that the recognizer will + not change its guess about this interim result. + """ + + def __init__(self, alternatives, is_final, stability): + self._alternatives = [Transcript.from_pb(alternative) + for alternative in alternatives] + self._is_final = is_final + self._stability = stability + + @classmethod + def from_pb(cls, pb_response): + """Factory: construct StreamingSpeechResult from protobuf response. + + :type pb_response: + :class:`google.cloud.speech.v1beta1.StreamingRecognitionResult` + :param pb_response: Instance of ``StreamingRecognitionResult``. 
+ + :rtype: :class:`~result.StreamingSpeechResult` + :returns: Instance of ``StreamingSpeechResult``. + """ + alternatives = pb_response.alternatives + is_final = pb_response.is_final + stability = pb_response.stability + return cls(alternatives, is_final, stability) + + @property + def alternatives(self): + """List of alternative transcripts. + + :rtype: list of :class:`~google.cloud.speech.transcript.Transcript` + :returns: List of ``Transcript`` objects. + """ + return self._alternatives + + @property + def is_final(self): + """Represents an interim result that may change. + + :rtype: bool + :returns: True if the result has completed its processing. + """ + return self._is_final diff --git a/speech/google/cloud/speech/transcript.py b/speech/google/cloud/speech/transcript.py index bbe915396c5c..800f4e26d45c 100644 --- a/speech/google/cloud/speech/transcript.py +++ b/speech/google/cloud/speech/transcript.py @@ -16,14 +16,30 @@ class Transcript(object): - """Representation of Speech Transcripts + """Representation of Speech Transcripts. - :type result: dict - :param result: Dictionary of transcript and confidence of recognition. + :type transcript: str + :param transcript: String of transcribed data. + + :type confidence: float + :param confidence: The confidence estimate between 0.0 and 1.0. """ - def __init__(self, result): - self._transcript = result.get('transcript') - self._confidence = result.get('confidence') + def __init__(self, transcript, confidence): + self._transcript = transcript + self._confidence = confidence + + @classmethod + def from_pb(cls, transcript): + """Factory: construct ``Transcript`` from protobuf response + + :type transcript: :class:`~SpeechRecognitionAlternative` + :param transcript: Instance of ``SpeechRecognitionAlternative`` + from protobuf. + + :rtype: :class:`~Transcript` + :returns: Instance of ``Transcript``.
+ """ + return cls(transcript.transcript, transcript.confidence) @property def transcript(self): diff --git a/speech/setup.py b/speech/setup.py index c02aeaad3e9d..c7504e1beac4 100644 --- a/speech/setup.py +++ b/speech/setup.py @@ -51,6 +51,7 @@ REQUIREMENTS = [ 'google-cloud-core >= 0.20.0', + 'gapic-google-cloud-speech-v1beta1 >= 0.11.1' ] setup( diff --git a/speech/unit_tests/streaming/__init__.py b/speech/unit_tests/streaming/__init__.py new file mode 100644 index 000000000000..58e0d9153632 --- /dev/null +++ b/speech/unit_tests/streaming/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2016 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/speech/unit_tests/streaming/test_container.py b/speech/unit_tests/streaming/test_container.py new file mode 100644 index 000000000000..3d1d8bd13c35 --- /dev/null +++ b/speech/unit_tests/streaming/test_container.py @@ -0,0 +1,124 @@ +# Copyright 2016 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + + +class TestStreamingContainer(unittest.TestCase): + def _getTargetClass(self): + from google.cloud.speech.streaming.container import ( + StreamingResponseContainer) + return StreamingResponseContainer + + def _makeOne(self, *args, **kw): + return self._getTargetClass()(*args, **kw) + + def test_ctor(self): + streaming_container = self._makeOne() + self.assertEqual(streaming_container.responses, {}) + streaming_container.add_response(_MockGAPICSpeechResponse()) + self.assertEqual(len(streaming_container.responses), 1) + + def test_is_not_finished(self): + true_result = _MockGAPICSpeechResult() + true_result.is_final = True + + false_result = _MockGAPICSpeechResult() + false_result.is_final = False + + first_response = _MockGAPICSpeechResponse() + first_response.results.append(true_result) + first_response.results.append(false_result) + + second_response = _MockGAPICSpeechResponse() + second_response.results.append(true_result) + second_response.results.append(true_result) + + streaming_container = self._makeOne() + streaming_container.add_response(first_response) + streaming_container.add_response(second_response) + + self.assertFalse(streaming_container.is_finished) + + def test_is_finished(self): + true_result = _MockGAPICSpeechResult() + true_result.is_final = True + + first_response = _MockGAPICSpeechResponse() + first_response.results.append(true_result) + first_response.results.append(true_result) + + second_response = _MockGAPICSpeechResponse() + second_response.results.append(true_result) + second_response.results.append(true_result) + second_response.result_index = 1 + + streaming_container = self._makeOne() + streaming_container.add_response(first_response) + streaming_container.add_response(second_response) + + self.assertTrue(streaming_container.is_finished) + + def test_get_full_text(self): + first_part = _MockGAPICSpeechResultAlternative(transcript='testing') + second_part = _MockGAPICSpeechResultAlternative(transcript=' 1 2 3') 
+ + first_result = _MockGAPICSpeechResult(alternatives=[first_part]) + first_result.is_final = True + + second_result = _MockGAPICSpeechResult(alternatives=[second_part]) + second_result.is_final = True + + response = _MockGAPICSpeechResponse() + response.results.append(first_result) + response.results.append(second_result) + + streaming_container = self._makeOne() + streaming_container.add_response(response) + + self.assertEqual(streaming_container.get_full_text(), 'testing 1 2 3') + + def test_unfinshed_full_test(self): + first_part = _MockGAPICSpeechResultAlternative(transcript='testing') + + first_result = _MockGAPICSpeechResult(alternatives=[first_part]) + first_result.is_final = False + + response = _MockGAPICSpeechResponse() + response.results.append(first_result) + + streaming_container = self._makeOne() + streaming_container.add_response(response) + + self.assertIsNone(streaming_container.get_full_text()) + + +class _MockGAPICSpeechResultAlternative(object): + def __init__(self, transcript='', confidence=0): + self.transcript = transcript + self.confidence = confidence + + +class _MockGAPICSpeechResult(object): + def __init__(self, alternatives=None): + self.alternatives = alternatives or [] + stability = 0 + is_final = False + + +class _MockGAPICSpeechResponse(object): + error = None + endpointer_type = None + results = [] + result_index = 0 diff --git a/speech/unit_tests/streaming/test_request.py b/speech/unit_tests/streaming/test_request.py new file mode 100644 index 000000000000..798a13b0acd3 --- /dev/null +++ b/speech/unit_tests/streaming/test_request.py @@ -0,0 +1,50 @@ +# Copyright 2016 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + + +class TestStreamingSpeechRequestHelpers(unittest.TestCase): + def test_make_request_stream(self): + from io import BytesIO + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + StreamingRecognizeRequest) + from google.cloud.speech.streaming.request import _make_request_stream + from google.cloud.speech.sample import Sample + + stream = BytesIO(b'g' * 1702) # Something bigger than a chunk. + sample = Sample(stream=stream, encoding='LINEAR16') + + request_count = 0 + for req in _make_request_stream(sample): + request_count += 1 + self.assertIsInstance(req, StreamingRecognizeRequest) + self.assertEqual(request_count, 3) + + def test_make_request_stream_short(self): + from io import BytesIO + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + StreamingRecognizeRequest) + from google.cloud.speech.streaming.request import _make_request_stream + from google.cloud.speech.sample import Sample + + stream = BytesIO(b'g' * (1599 * 2)) # Something bigger than a chunk. + sample = Sample(stream=stream, encoding='LINEAR16') + + request_count = 0 + for req in _make_request_stream(sample): + request_count += 1 + self.assertIsInstance(req, StreamingRecognizeRequest) + + self.assertEqual(request_count, 3) diff --git a/speech/unit_tests/streaming/test_response.py b/speech/unit_tests/streaming/test_response.py new file mode 100644 index 000000000000..d31be50dd0f1 --- /dev/null +++ b/speech/unit_tests/streaming/test_response.py @@ -0,0 +1,29 @@ +# Copyright 2016 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + + +class TestStreamingSpeechResponse(unittest.TestCase): + def _getTargetClass(self): + from google.cloud.speech.streaming.response import ( + StreamingSpeechResponse) + return StreamingSpeechResponse + + def _makeOne(self, *args, **kw): + return self._getTargetClass()(*args, **kw) + + def test_ctor(self): + response = self._makeOne({}, 'END_OF_UTTERANCE', [], 0) + self.assertEqual(response.result_index, 0) diff --git a/speech/unit_tests/test_client.py b/speech/unit_tests/test_client.py index 5972a0014eb3..09a67c1fb778 100644 --- a/speech/unit_tests/test_client.py +++ b/speech/unit_tests/test_client.py @@ -195,6 +195,84 @@ def test_async_recognize(self): self.assertFalse(operation.complete) self.assertIsNone(operation.metadata) + def test_streaming_depends_on_gax(self): + from google.cloud.speech import client as MUT + from google.cloud._testing import _Monkey + creds = _Credentials() + client = self._makeOne(credentials=creds) + client.connection = _Connection() + + with _Monkey(MUT, _USE_GAX=False): + with self.assertRaises(EnvironmentError): + client.stream_recognize({}) + + def test_set_speech_api(self): + from google.cloud.speech import client as MUT + from google.cloud._testing import _Monkey + creds = _Credentials() + client = self._makeOne(credentials=creds) + client.connection = _Connection() + + with _Monkey(MUT, SpeechApi=_MockGAPICSpeechAPI): + client._speech_api = None + speech_api = 
client.speech_api + self.assertIsInstance(speech_api, _MockGAPICSpeechAPI) + + def test_streaming_closed_stream(self): + from io import BytesIO + from google.cloud.speech.encoding import Encoding + + stream = BytesIO(b'Some audio data...') + credentials = _Credentials() + client = self._makeOne(credentials=credentials) + client.connection = _Connection() + client._speech_api = _MockGAPICSpeechAPI() + + stream.close() + sample = client.sample(stream=stream, + encoding=Encoding.LINEAR16, + sample_rate=self.SAMPLE_RATE) + with self.assertRaises(ValueError): + client.stream_recognize(sample) + + def test_stream_recognize(self): + from io import BytesIO + from google.cloud.speech.encoding import Encoding + from google.cloud.speech.streaming.container import ( + StreamingResponseContainer) + + stream = BytesIO(b'Some audio data...') + credentials = _Credentials() + client = self._makeOne(credentials=credentials) + client.connection = _Connection() + client._speech_api = _MockGAPICSpeechAPI() + + sample = client.sample(stream=stream, + encoding=Encoding.LINEAR16, + sample_rate=self.SAMPLE_RATE) + results = client.stream_recognize(sample) + + self.assertIsInstance(results, StreamingResponseContainer) + requests = [] + for req in client.speech_api._requests: + requests.append(req) + self.assertEqual(len(requests), 2) + + +class _MockGAPICSpeechResponse(object): + error = None + endpointer_type = None + results = [] + result_index = 0 + + +class _MockGAPICSpeechAPI(object): + _requests = None + + def streaming_recognize(self, requests): + self._requests = requests + return [None, _MockGAPICSpeechResponse()] + class _Credentials(object): diff --git a/speech/unit_tests/test_transcript.py b/speech/unit_tests/test_transcript.py index b585d6e7429c..6cbf038546b4 100644 --- a/speech/unit_tests/test_transcript.py +++ b/speech/unit_tests/test_transcript.py @@ -26,7 +26,8 @@ def _makeOne(self, *args, **kwargs): def test_ctor(self): from unit_tests._fixtures import 
OPERATION_COMPLETE_RESPONSE as DATA TRANSCRIPT_DATA = DATA['response']['results'][0]['alternatives'][0] - transcript = self._makeOne(TRANSCRIPT_DATA) + transcript = self._makeOne(TRANSCRIPT_DATA['transcript'], + TRANSCRIPT_DATA['confidence']) self.assertEqual('how old is the Brooklyn Bridge', transcript.transcript) self.assertEqual(0.98267895, transcript.confidence) From 0fef32189c96327ecc19266b591435b88fa51df9 Mon Sep 17 00:00:00 2001 From: Thomas Schultz Date: Thu, 13 Oct 2016 01:07:55 -0400 Subject: [PATCH 2/6] Make API surface more usable. --- docs/speech-usage.rst | 67 +++++----- speech/google/cloud/speech/client.py | 9 +- .../cloud/speech/streaming/container.py | 72 ---------- .../cloud/speech/streaming/endpointer_type.py | 14 ++ .../google/cloud/speech/streaming/response.py | 56 +++++++- .../google/cloud/speech/streaming/result.py | 2 +- speech/unit_tests/streaming/test_container.py | 124 ------------------ speech/unit_tests/streaming/test_response.py | 31 +++++ speech/unit_tests/test_client.py | 31 ++++- 9 files changed, 159 insertions(+), 247 deletions(-) delete mode 100644 speech/google/cloud/speech/streaming/container.py create mode 100644 speech/google/cloud/speech/streaming/endpointer_type.py delete mode 100644 speech/unit_tests/streaming/test_container.py diff --git a/docs/speech-usage.rst b/docs/speech-usage.rst index 6161f1925d09..3e1ada07b6bc 100644 --- a/docs/speech-usage.rst +++ b/docs/speech-usage.rst @@ -157,6 +157,8 @@ data to possible text alternatives on the fly. See: https://cloud.google.com/speech/limits#content +.. code-block:: python + >>> import io >>> from google.cloud import speech >>> from google.cloud.speech.encoding import Encoding @@ -164,17 +166,38 @@ data to possible text alternatives on the fly. >>> with io.open('./hello.wav', 'rb') as stream: >>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16, ... 
sample_rate=16000) - >>> stream_container = client.stream_recognize(sample) - >>> print(stream_container) - - >>> print(stream_container.responses) - {0: } - >>> print(stream_container.responses[0].results[0].alternatives[0].confidence) - 0.698092460632 - >>> print(stream_container.is_finished) + >>> for response in client.stream_recognize(sample): + ... print(response.transcript) + hello + ... print(response.is_final) True - >>> print stream_container.get_full_text() + + +By setting ``interim_results`` to true, interim results (tentative hypotheses) +may be returned as they become available (these interim results are indicated +with the is_final=false flag). If false or omitted, only is_final=true +result(s) are returned. + +.. code-block:: python + + >>> import io + >>> from google.cloud import speech + >>> from google.cloud.speech.encoding import Encoding + >>> client = speech.Client() + >>> with io.open('./hello.wav', 'rb') as stream: + >>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16, + ... sample_rate=16000) + >>> for response in client.stream_recognize(sample, + ... interim_results=True): + ... print(response.transcript) + hell + ... print(response.is_final) + False + ... print(response.transcript) hello + ... print(response.is_final) + True + By default the recognizer will perform continuous recognition (continuing to process audio even if the user pauses speaking) until the client @@ -195,32 +218,6 @@ See: `Single Utterance`_ >>> print(stream_container.get_full_text()) hello - -If ``interim_results`` is set to ``True``, interim results -(tentative hypotheses) may be returned as they become available. - - .. code-block:: python - - >>> with io.open('./hello_pause_goodbye.wav', 'rb') as stream: - >>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16, - ... sample_rate=16000) - >>> stream_container = client.stream_recognize(sample, - ... 
interim_results=True) - >>> print(stream_container.get_full_text()) - hello - - >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', - ... encoding=Encoding.FLAC, - ... sample_rate=44100) - >>> results = client.stream_recognize(sample, interim_results=True) - >>> print(stream_container.responses[0].results[0].alternatives[0].transcript) - how - print(stream_container.responses[1].results[0].alternatives[0].transcript) - hello - >>> print(stream_container.responses[1].results[2].is_final) - True - - .. _Single Utterance: https://cloud.google.com/speech/reference/rpc/google.cloud.speech.v1beta1#streamingrecognitionconfig .. _sync_recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/syncrecognize .. _Speech Asynchronous Recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/asyncrecognize diff --git a/speech/google/cloud/speech/client.py b/speech/google/cloud/speech/client.py index c9598d16cee4..8809b4eed540 100644 --- a/speech/google/cloud/speech/client.py +++ b/speech/google/cloud/speech/client.py @@ -26,7 +26,7 @@ from google.cloud.speech.operation import Operation from google.cloud.speech.streaming.request import _make_request_stream from google.cloud.speech.sample import Sample -from google.cloud.speech.streaming.container import StreamingResponseContainer +from google.cloud.speech.streaming.response import StreamingSpeechResponse try: from google.cloud.gapic.speech.v1beta1.speech_api import SpeechApi @@ -302,12 +302,9 @@ def stream_recognize(self, sample, language_code=None, single_utterance=single_utterance, interim_results=interim_results) - responses = StreamingResponseContainer() for response in self.speech_api.streaming_recognize(requests): - if response: - responses.add_response(response) - - return responses + if hasattr(response, 'results') or interim_results: + yield StreamingSpeechResponse.from_pb(response) @property def speech_api(self): diff --git 
a/speech/google/cloud/speech/streaming/container.py b/speech/google/cloud/speech/streaming/container.py deleted file mode 100644 index 52384b9597a2..000000000000 --- a/speech/google/cloud/speech/streaming/container.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Representation of a group of GAPIC Speech API responses.""" - -from google.cloud.speech.streaming.response import StreamingSpeechResponse - - -class StreamingResponseContainer(object): - """Response container to help manage streaming responses. - - :type responses: list of :class:`~response.StreamingSpeechResponse` - :param responses: List of ``StreamingSpeechResponse`` objects. - """ - def __init__(self, responses=None): - self._responses = responses or {} - - def add_response(self, response): - """Add/update response based on the ``result_index``. - - :type response: :class:`~response.StreamingSpeechResponse` - :param response: Instance of ``StreamingSpeechResponse``. - """ - self._responses.update({response.result_index: - StreamingSpeechResponse.from_pb(response)}) - - @property - def responses(self): - """All responses held in container. - - :rtype: list of :class:`~response.StreamingSpeechResponse` - :returns: List of ``StreamingSpeechResponse`` objects. - """ - return self._responses - - @property - def is_finished(self): - """Helper property to determin if all resuls are final. 
- - :rtype: bool - :returns: True of all captured results are final. - """ - finished = [] - for response in self.responses.values(): - for result in response.results: - finished.append(result.is_final) - return all(finished) - - def get_full_text(self): - """Parse together all transcript results to form complete text. - - :rtype: str - :returns: Complete transcription. - """ - text = None - if self.is_finished: - text = '' - for response in self.responses.values(): - for result in response.results: - text += result.alternatives[0].transcript - return text diff --git a/speech/google/cloud/speech/streaming/endpointer_type.py b/speech/google/cloud/speech/streaming/endpointer_type.py new file mode 100644 index 000000000000..987775a6a75d --- /dev/null +++ b/speech/google/cloud/speech/streaming/endpointer_type.py @@ -0,0 +1,14 @@ +class EndpointerType(object): + ENDPOINTER_EVENT_UNSPECIFIED = 0 + START_OF_SPEECH = 1 + END_OF_SPEECH = 2 + END_OF_AUDIO = 3 + END_OF_UTTERANCE = 4 + + reverse_map = { + 0: 'ENDPOINTER_EVENT_UNSPECIFIED', + 1: 'START_OF_SPEECH', + 2: 'END_OF_SPEECH', + 3: 'END_OF_AUDIO', + 4: 'END_OF_UTTERANCE' + } diff --git a/speech/google/cloud/speech/streaming/response.py b/speech/google/cloud/speech/streaming/response.py index ec9428985913..4caf39ba186c 100644 --- a/speech/google/cloud/speech/streaming/response.py +++ b/speech/google/cloud/speech/streaming/response.py @@ -14,6 +14,7 @@ """Representation of a GAPIC Speech API response.""" +from google.cloud.speech.streaming.endpointer_type import EndpointerType from google.cloud.speech.streaming.result import StreamingSpeechResult @@ -34,9 +35,12 @@ class StreamingSpeechResponse(object): :param result_index: Index for specific result set. Used for updating with ``interim_results``. 
""" - def __init__(self, error, endpointer_type, results, result_index): + def __init__(self, error=None, endpointer_type=None, results=None, + result_index=None): + results = results or [] self._error = error - self._endpointer_type = endpointer_type # Should be enum. + self._endpointer_type = EndpointerType.reverse_map.get( + endpointer_type, None) self._result_index = result_index self._results = [StreamingSpeechResult.from_pb(result) for result in results] @@ -56,7 +60,41 @@ def from_pb(cls, pb_response): endpointer_type = pb_response.endpointer_type results = pb_response.results result_index = pb_response.result_index - return cls(error, endpointer_type, results, result_index) + return cls(error=error, endpointer_type=endpointer_type, + results=results, result_index=result_index) + + @property + def confidence(self): + """Confidence score for recognized speech. + + :rtype: float + :returns: Confidence score of recognized speech [0.0-1.0]. + """ + if self.results and self.results[0].alternatives: + return self.results[0].alternatives[0].confidence + else: + return 0.0 + + @property + def endpointer_type(self): + """Endpointer indicating the state of the speech detection. + + :rtype: str + :returns: String derived from :class:`~endpointer_type.EndpointerType`. + """ + return self._endpointer_type + + @property + def is_final(self): + """Represents an interim result that may change. + + :rtype: bool + :returns: True if the result has completed it's processing. + """ + if len(self.results): + return self.results[0].is_final + else: + return False @property def result_index(self): @@ -75,3 +113,15 @@ def results(self): :returns: List of ``StreamingSpeechResult`` in this response. """ return self._results + + @property + def transcript(self): + """Get most likely transcript from response. + + :rtype: str + :returns: Transcript text from response. 
+ """ + if self.results and self.results[0].alternatives: + return self.results[0].alternatives[0].transcript + else: + return '' diff --git a/speech/google/cloud/speech/streaming/result.py b/speech/google/cloud/speech/streaming/result.py index 104916eda9e0..6cfc37c18ad5 100644 --- a/speech/google/cloud/speech/streaming/result.py +++ b/speech/google/cloud/speech/streaming/result.py @@ -70,4 +70,4 @@ def is_final(self): :rtype: bool :returns: True if the result has completed it's processing. """ - return self._is_final + return bool(self._is_final) diff --git a/speech/unit_tests/streaming/test_container.py b/speech/unit_tests/streaming/test_container.py deleted file mode 100644 index 3d1d8bd13c35..000000000000 --- a/speech/unit_tests/streaming/test_container.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - - -class TestStreamingContainer(unittest.TestCase): - def _getTargetClass(self): - from google.cloud.speech.streaming.container import ( - StreamingResponseContainer) - return StreamingResponseContainer - - def _makeOne(self, *args, **kw): - return self._getTargetClass()(*args, **kw) - - def test_ctor(self): - streaming_container = self._makeOne() - self.assertEqual(streaming_container.responses, {}) - streaming_container.add_response(_MockGAPICSpeechResponse()) - self.assertEqual(len(streaming_container.responses), 1) - - def test_is_not_finished(self): - true_result = _MockGAPICSpeechResult() - true_result.is_final = True - - false_result = _MockGAPICSpeechResult() - false_result.is_final = False - - first_response = _MockGAPICSpeechResponse() - first_response.results.append(true_result) - first_response.results.append(false_result) - - second_response = _MockGAPICSpeechResponse() - second_response.results.append(true_result) - second_response.results.append(true_result) - - streaming_container = self._makeOne() - streaming_container.add_response(first_response) - streaming_container.add_response(second_response) - - self.assertFalse(streaming_container.is_finished) - - def test_is_finished(self): - true_result = _MockGAPICSpeechResult() - true_result.is_final = True - - first_response = _MockGAPICSpeechResponse() - first_response.results.append(true_result) - first_response.results.append(true_result) - - second_response = _MockGAPICSpeechResponse() - second_response.results.append(true_result) - second_response.results.append(true_result) - second_response.result_index = 1 - - streaming_container = self._makeOne() - streaming_container.add_response(first_response) - streaming_container.add_response(second_response) - - self.assertTrue(streaming_container.is_finished) - - def test_get_full_text(self): - first_part = _MockGAPICSpeechResultAlternative(transcript='testing') - second_part = _MockGAPICSpeechResultAlternative(transcript=' 1 2 3') 
- - first_result = _MockGAPICSpeechResult(alternatives=[first_part]) - first_result.is_final = True - - second_result = _MockGAPICSpeechResult(alternatives=[second_part]) - second_result.is_final = True - - response = _MockGAPICSpeechResponse() - response.results.append(first_result) - response.results.append(second_result) - - streaming_container = self._makeOne() - streaming_container.add_response(response) - - self.assertEqual(streaming_container.get_full_text(), 'testing 1 2 3') - - def test_unfinshed_full_test(self): - first_part = _MockGAPICSpeechResultAlternative(transcript='testing') - - first_result = _MockGAPICSpeechResult(alternatives=[first_part]) - first_result.is_final = False - - response = _MockGAPICSpeechResponse() - response.results.append(first_result) - - streaming_container = self._makeOne() - streaming_container.add_response(response) - - self.assertIsNone(streaming_container.get_full_text()) - - -class _MockGAPICSpeechResultAlternative(object): - def __init__(self, transcript='', confidence=0): - self.transcript = transcript - self.confidence = confidence - - -class _MockGAPICSpeechResult(object): - def __init__(self, alternatives=None): - self.alternatives = alternatives or [] - stability = 0 - is_final = False - - -class _MockGAPICSpeechResponse(object): - error = None - endpointer_type = None - results = [] - result_index = 0 diff --git a/speech/unit_tests/streaming/test_response.py b/speech/unit_tests/streaming/test_response.py index d31be50dd0f1..d2a695d3f14d 100644 --- a/speech/unit_tests/streaming/test_response.py +++ b/speech/unit_tests/streaming/test_response.py @@ -27,3 +27,34 @@ def _makeOne(self, *args, **kw): def test_ctor(self): response = self._makeOne({}, 'END_OF_UTTERANCE', [], 0) self.assertEqual(response.result_index, 0) + self.assertEqual(response.confidence, 0.0) + self.assertEqual(response.endpointer_type, None) + self.assertEqual(response.results, []) + self.assertEqual(response.transcript, '') + 
self.assertFalse(response.is_final) + + def test_from_pb(self): + response = self._makeOne() + res = response.from_pb(_MockSpeechPBResponse) + self.assertFalse(res.is_final) + self.assertEqual(res.endpointer_type, 'END_OF_AUDIO') + self.assertEqual(res.transcript, 'hello there!') + self.assertEqual(res.confidence, 0.9704365) + + +class _MockSpeechPBAlternative(object): + transcript = 'hello there!' + confidence = 0.9704365 + + +class _MockSpeechPBResult(object): + alternatives = [_MockSpeechPBAlternative()] + is_final = False + stability = 0.0 + + +class _MockSpeechPBResponse(object): + error = {} + endpointer_type = 3 + result_index = 0 + results = [_MockSpeechPBResult, _MockSpeechPBResult] diff --git a/speech/unit_tests/test_client.py b/speech/unit_tests/test_client.py index 09a67c1fb778..9d3176c56ba9 100644 --- a/speech/unit_tests/test_client.py +++ b/speech/unit_tests/test_client.py @@ -204,7 +204,7 @@ def test_streaming_depends_on_gax(self): with _Monkey(MUT, _USE_GAX=False): with self.assertRaises(EnvironmentError): - client.stream_recognize({}) + next(client.stream_recognize({})) def test_set_speech_api(self): from google.cloud.speech import client as MUT @@ -233,13 +233,31 @@ def test_streaming_closed_stream(self): encoding=Encoding.LINEAR16, sample_rate=self.SAMPLE_RATE) with self.assertRaises(ValueError): - client.stream_recognize(sample) + next(client.stream_recognize(sample)) + + def test_streaming_with_empty_response(self): + from io import BytesIO + from google.cloud.speech.encoding import Encoding + + stream = BytesIO(b'Some audio data...') + credentials = _Credentials() + client = self._makeOne(credentials=credentials) + client.connection = _Connection() + client._speech_api = _MockGAPICSpeechAPI() + client._speech_api._responses = [] + + sample = client.sample(stream=stream, + encoding=Encoding.LINEAR16, + sample_rate=self.SAMPLE_RATE) + results = client.stream_recognize(sample) + with self.assertRaises(StopIteration): + next(results) def 
test_stream_recognize(self): from io import BytesIO from google.cloud.speech.encoding import Encoding - from google.cloud.speech.streaming.container import ( - StreamingResponseContainer) + from google.cloud.speech.streaming.response import ( + StreamingSpeechResponse) stream = BytesIO(b'Some audio data...') credentials = _Credentials() @@ -252,7 +270,7 @@ def test_stream_recognize(self): sample_rate=self.SAMPLE_RATE) results = client.stream_recognize(sample) - self.assertIsInstance(results, StreamingResponseContainer) + self.assertIsInstance(next(results), StreamingSpeechResponse) requests = [] for req in client.speech_api._requests: requests.append(req) @@ -268,10 +286,11 @@ class _MockGAPICSpeechResponse(object): class _MockGAPICSpeechAPI(object): _requests = None + _responses = [None, _MockGAPICSpeechResponse()] def streaming_recognize(self, requests): self._requests = requests - return [None, _MockGAPICSpeechResponse()] + return self._responses class _Credentials(object): From a6cc2c79bbca746284284af76b36cb7ae7cd6b32 Mon Sep 17 00:00:00 2001 From: Thomas Schultz Date: Thu, 13 Oct 2016 01:49:41 -0400 Subject: [PATCH 3/6] Alternate strategy,to just pass back responses. 
--- docs/speech-streaming.rst | 28 +-- docs/speech-usage.rst | 26 +-- speech/google/cloud/speech/__init__.py | 1 + speech/google/cloud/speech/client.py | 175 ++++++++++++++++- .../__init__.py => endpointer_type.py} | 0 speech/google/cloud/speech/sample.py | 2 +- .../cloud/speech/streaming/endpointer_type.py | 14 -- .../google/cloud/speech/streaming/request.py | 183 ------------------ .../response.py => streaming_response.py} | 33 +++- .../result.py => streaming_result.py} | 0 speech/unit_tests/streaming/__init__.py | 13 -- speech/unit_tests/test_client.py | 19 +- .../{streaming => }/test_request.py | 8 +- .../{streaming => }/test_response.py | 2 +- 14 files changed, 219 insertions(+), 285 deletions(-) rename speech/google/cloud/speech/{streaming/__init__.py => endpointer_type.py} (100%) delete mode 100644 speech/google/cloud/speech/streaming/endpointer_type.py delete mode 100644 speech/google/cloud/speech/streaming/request.py rename speech/google/cloud/speech/{streaming/response.py => streaming_response.py} (83%) rename speech/google/cloud/speech/{streaming/result.py => streaming_result.py} (100%) delete mode 100644 speech/unit_tests/streaming/__init__.py rename speech/unit_tests/{streaming => }/test_request.py (86%) rename speech/unit_tests/{streaming => }/test_response.py (96%) diff --git a/docs/speech-streaming.rst b/docs/speech-streaming.rst index 6fa42dbd9f79..eab505b4d06d 100644 --- a/docs/speech-streaming.rst +++ b/docs/speech-streaming.rst @@ -1,33 +1,23 @@ -Speech StreamingResponseContainer -================================= +Streaming Speech Response +========================= -.. automodule:: google.cloud.speech.streaming.container +.. automodule:: google.cloud.speech.streaming_response :members: :undoc-members: :show-inheritance: -Speech Streaming Request helpers -================================ +Streaming Speech Result +======================= -.. automodule:: google.cloud.speech.streaming.request +.. 
automodule:: google.cloud.speech.streaming_result :members: :undoc-members: :show-inheritance: -Speech StreamingSpeechResponse -============================== +Streaming Endpointer Type +========================= -.. automodule:: google.cloud.speech.streaming.response - :members: - :undoc-members: - :show-inheritance: - - - -Speech StreamingSpeechResult -============================ - -.. automodule:: google.cloud.speech.streaming.result +.. automodule:: google.cloud.speech.endpointer_type :members: :undoc-members: :show-inheritance: diff --git a/docs/speech-usage.rst b/docs/speech-usage.rst index 3e1ada07b6bc..8b649fae9dce 100644 --- a/docs/speech-usage.rst +++ b/docs/speech-usage.rst @@ -51,10 +51,9 @@ See: `Speech Asynchronous Recognize`_ >>> import time >>> from google.cloud import speech - >>> from google.cloud.speech.encoding import Encoding >>> client = speech.Client() >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', - ... encoding=Encoding.LINEAR16, + ... encoding=speech.Encoding.LINEAR16, ... sample_rate=44100) >>> operation = client.async_recognize(sample, max_alternatives=2) >>> retry_count = 100 @@ -82,10 +81,9 @@ Great Britian. .. code-block:: python >>> from google.cloud import speech - >>> from google.cloud.speech.encoding import Encoding >>> client = speech.Client() >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', - ... encoding=Encoding.FLAC, + ... encoding=speech.Encoding.FLAC, ... sample_rate=44100) >>> operation = client.async_recognize(sample, max_alternatives=2) >>> alternatives = client.sync_recognize( @@ -107,10 +105,9 @@ Example of using the profanity filter. .. code-block:: python >>> from google.cloud import speech - >>> from google.cloud.speech.encoding import Encoding >>> client = speech.Client() >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', - ... encoding=Encoding.FLAC, + ... encoding=speech.Encoding.FLAC, ... 
sample_rate=44100) >>> alternatives = client.sync_recognize(sample, max_alternatives=1, ... profanity_filter=True) @@ -129,10 +126,9 @@ words to the vocabulary of the recognizer. .. code-block:: python >>> from google.cloud import speech - >>> from google.cloud.speech.encoding import Encoding >>> client = speech.Client() >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', - ... encoding=Encoding.FLAC, + ... encoding=speech.Encoding.FLAC, ... sample_rate=44100) >>> hints = ['hi', 'good afternoon'] >>> alternatives = client.sync_recognize(sample, max_alternatives=2, @@ -161,12 +157,11 @@ data to possible text alternatives on the fly. >>> import io >>> from google.cloud import speech - >>> from google.cloud.speech.encoding import Encoding >>> client = speech.Client() >>> with io.open('./hello.wav', 'rb') as stream: - >>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16, + ... sample = client.sample(stream=stream, encoding=speech.Encoding.LINEAR16, ... sample_rate=16000) - >>> for response in client.stream_recognize(sample): + ... for response in client.stream_recognize(sample): ... print(response.transcript) hello ... print(response.is_final) @@ -182,12 +177,11 @@ result(s) are returned. >>> import io >>> from google.cloud import speech - >>> from google.cloud.speech.encoding import Encoding >>> client = speech.Client() >>> with io.open('./hello.wav', 'rb') as stream: - >>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16, + >>> sample = client.sample(stream=stream, encoding=speech.Encoding.LINEAR16, ... sample_rate=16000) - >>> for response in client.stream_recognize(sample, + ... for response in client.stream_recognize(sample, ... interim_results=True): ... print(response.transcript) hell @@ -211,9 +205,9 @@ See: `Single Utterance`_ .. 
code-block:: python >>> with io.open('./hello_pause_goodbye.wav', 'rb') as stream: - >>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16, + >>> sample = client.sample(stream=stream, encoding=speech.Encoding.LINEAR16, ... sample_rate=16000) - >>> stream_container = client.stream_recognize(sample, + ... stream_container = client.stream_recognize(sample, ... single_utterance=True) >>> print(stream_container.get_full_text()) hello diff --git a/speech/google/cloud/speech/__init__.py b/speech/google/cloud/speech/__init__.py index ef55810893a7..4a9e4e4f6fc6 100644 --- a/speech/google/cloud/speech/__init__.py +++ b/speech/google/cloud/speech/__init__.py @@ -16,3 +16,4 @@ from google.cloud.speech.client import Client from google.cloud.speech.connection import Connection +from google.cloud.speech.encoding import Encoding diff --git a/speech/google/cloud/speech/client.py b/speech/google/cloud/speech/client.py index 8809b4eed540..39b29a808761 100644 --- a/speech/google/cloud/speech/client.py +++ b/speech/google/cloud/speech/client.py @@ -24,12 +24,17 @@ from google.cloud.speech.connection import Connection from google.cloud.speech.encoding import Encoding from google.cloud.speech.operation import Operation -from google.cloud.speech.streaming.request import _make_request_stream from google.cloud.speech.sample import Sample -from google.cloud.speech.streaming.response import StreamingSpeechResponse +from google.cloud.speech.streaming_response import StreamingSpeechResponse try: from google.cloud.gapic.speech.v1beta1.speech_api import SpeechApi + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + RecognitionConfig) + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + StreamingRecognitionConfig) + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + StreamingRecognizeRequest) except ImportError: # pragma: NO COVER _HAVE_GAX = False else: @@ -284,16 +289,9 @@ def stream_recognize(self, sample, language_code=None, with the 
is_final=false flag). If false or omitted, only is_final=true result(s) are returned. - - :rtype: :class:`~streaming.StreamingResponseContainer` - :returns: An instance of ``StreamingReponseContainer``. - """ if not _USE_GAX: - raise EnvironmentError('GRPC is required to use this API.') - - if sample.stream.closed: - raise ValueError('Stream is closed.') + raise EnvironmentError('gRPC is required to use this API.') requests = _make_request_stream(sample, language_code=language_code, max_alternatives=max_alternatives, @@ -379,3 +377,160 @@ def _build_request_data(sample, language_code=None, max_alternatives=None, } return data + + +def _make_request_stream(sample, language_code=None, max_alternatives=None, + profanity_filter=None, speech_context=None, + single_utterance=None, interim_results=None): + """Generate stream of requests from sample. + + :type sample: :class:`~google.cloud.speech.sample.Sample` + :param sample: Instance of ``Sample`` containing audio information. + + :type language_code: str + :param language_code: (Optional) The language of the supplied audio as + BCP-47 language tag. Example: ``'en-GB'``. + If omitted, defaults to ``'en-US'``. + + :type max_alternatives: int + :param max_alternatives: (Optional) Maximum number of recognition + hypotheses to be returned. The server may + return fewer than maxAlternatives. + Valid values are 0-30. A value of 0 or 1 + will return a maximum of 1. Defaults to 1 + + :type profanity_filter: bool + :param profanity_filter: If True, the server will attempt to filter + out profanities, replacing all but the + initial character in each filtered word with + asterisks, e.g. ``'f***'``. If False or + omitted, profanities won't be filtered out. + + :type speech_context: list + :param speech_context: A list of strings (max 50) containing words and + phrases "hints" so that the speech recognition + is more likely to recognize them. This can be + used to improve the accuracy for specific words + and phrases. 
This can also be used to add new + words to the vocabulary of the recognizer. + + :type single_utterance: boolean + :param single_utterance: [Optional] If false or omitted, the recognizer + will perform continuous recognition + (continuing to process audio even if the user + pauses speaking) until the client closes the + output stream (gRPC API) or when the maximum + time limit has been reached. Multiple + SpeechRecognitionResults with the is_final + flag set to true may be returned. + + If true, the recognizer will detect a single + spoken utterance. When it detects that the + user has paused or stopped speaking, it will + return an END_OF_UTTERANCE event and cease + recognition. It will return no more than one + SpeechRecognitionResult with the is_final flag + set to true. + + :type interim_results: boolean + :param interim_results: [Optional] If true, interim results (tentative + hypotheses) may be returned as they become + available (these interim results are indicated + with the is_final=false flag). If false or + omitted, only is_final=true result(s) are + returned. + """ + config_request = _make_streaming_config( + sample, language_code=language_code, max_alternatives=max_alternatives, + profanity_filter=profanity_filter, speech_context=speech_context, + single_utterance=single_utterance, interim_results=interim_results) + + # The config request MUST go first and not contain any audio data. + yield config_request + + while True: + data = sample.stream.read(sample.chunk_size) + if not data: + break + # Optimize the request data size to around 100ms. + yield StreamingRecognizeRequest(audio_content=data) + + +def _make_streaming_config(sample, language_code, + max_alternatives, profanity_filter, + speech_context, single_utterance, + interim_results): + """Build streaming configuration. + + :type sample: :class:`~google.cloud.speech.sample.Sample` + :param sample: Instance of ``Sample`` containing audio information. 
+ + :type language_code: str + :param language_code: (Optional) The language of the supplied audio as + BCP-47 language tag. Example: ``'en-GB'``. + If omitted, defaults to ``'en-US'``. + + :type max_alternatives: int + :param max_alternatives: (Optional) Maximum number of recognition + hypotheses to be returned. The server may + return fewer than maxAlternatives. + Valid values are 0-30. A value of 0 or 1 + will return a maximum of 1. Defaults to 1 + + :type profanity_filter: bool + :param profanity_filter: If True, the server will attempt to filter + out profanities, replacing all but the + initial character in each filtered word with + asterisks, e.g. ``'f***'``. If False or + omitted, profanities won't be filtered out. + + :type speech_context: list + :param speech_context: A list of strings (max 50) containing words and + phrases "hints" so that the speech recognition + is more likely to recognize them. This can be + used to improve the accuracy for specific words + and phrases. This can also be used to add new + words to the vocabulary of the recognizer. + + :type single_utterance: boolean + :param single_utterance: [Optional] If false or omitted, the recognizer + will perform continuous recognition + (continuing to process audio even if the user + pauses speaking) until the client closes the + output stream (gRPC API) or when the maximum + time limit has been reached. Multiple + SpeechRecognitionResults with the is_final + flag set to true may be returned. + + If true, the recognizer will detect a single + spoken utterance. When it detects that the + user has paused or stopped speaking, it will + return an END_OF_UTTERANCE event and cease + recognition. It will return no more than one + SpeechRecognitionResult with the is_final flag + set to true. 
+ + :type interim_results: boolean + :param interim_results: [Optional] If true, interim results (tentative + hypotheses) may be returned as they become + available (these interim results are indicated + with the is_final=false flag). If false or + omitted, only is_final=true result(s) are + returned. + + :rtype: :class:`~StreamingRecognitionConfig` + :returns: Instance of ``StreamingRecognitionConfig``. + """ + config = RecognitionConfig( + encoding=sample.encoding, sample_rate=sample.sample_rate, + language_code=language_code, max_alternatives=max_alternatives, + profanity_filter=profanity_filter, speech_context=speech_context) + + streaming_config = StreamingRecognitionConfig( + config=config, single_utterance=single_utterance, + interim_results=interim_results) + + config_request = StreamingRecognizeRequest( + streaming_config=streaming_config) + + return config_request diff --git a/speech/google/cloud/speech/streaming/__init__.py b/speech/google/cloud/speech/endpointer_type.py similarity index 100% rename from speech/google/cloud/speech/streaming/__init__.py rename to speech/google/cloud/speech/endpointer_type.py diff --git a/speech/google/cloud/speech/sample.py b/speech/google/cloud/speech/sample.py index 5b1608f80e79..fbc97adf1d0d 100644 --- a/speech/google/cloud/speech/sample.py +++ b/speech/google/cloud/speech/sample.py @@ -52,7 +52,7 @@ class Sample(object): def __init__(self, content=None, source_uri=None, stream=None, encoding=None, sample_rate=None): - if [content, source_uri, stream].count(None) != 2: + if (content, source_uri, stream).count(None) != 2: raise ValueError('Supply only one of \'content\', \'source_uri\'' ' or stream.') diff --git a/speech/google/cloud/speech/streaming/endpointer_type.py b/speech/google/cloud/speech/streaming/endpointer_type.py deleted file mode 100644 index 987775a6a75d..000000000000 --- a/speech/google/cloud/speech/streaming/endpointer_type.py +++ /dev/null @@ -1,14 +0,0 @@ -class EndpointerType(object): - 
ENDPOINTER_EVENT_UNSPECIFIED = 0 - START_OF_SPEECH = 1 - END_OF_SPEECH = 2 - END_OF_AUDIO = 3 - END_OF_UTTERANCE = 4 - - reverse_map = { - 0: 'ENDPOINTER_EVENT_UNSPECIFIED', - 1: 'START_OF_SPEECH', - 2: 'END_OF_SPEECH', - 3: 'END_OF_AUDIO', - 4: 'END_OF_UTTERANCE' - } diff --git a/speech/google/cloud/speech/streaming/request.py b/speech/google/cloud/speech/streaming/request.py deleted file mode 100644 index 794e3f992a67..000000000000 --- a/speech/google/cloud/speech/streaming/request.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Helper to make Speech requests from IO stream""" - -from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import RecognitionConfig -from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( - StreamingRecognitionConfig) -from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( - StreamingRecognizeRequest) - - -def _make_request_stream(sample, language_code=None, max_alternatives=None, - profanity_filter=None, speech_context=None, - single_utterance=None, interim_results=None): - """Generate stream of requests from sample. - - :type sample: :class:`~google.cloud.speech.sample.Sample` - :param sample: Instance of ``Sample`` containing audio information. - - :type language_code: str - :param language_code: (Optional) The language of the supplied audio as - BCP-47 language tag. Example: ``'en-GB'``. - If omitted, defaults to ``'en-US'``. 
- - :type max_alternatives: int - :param max_alternatives: (Optional) Maximum number of recognition - hypotheses to be returned. The server may - return fewer than maxAlternatives. - Valid values are 0-30. A value of 0 or 1 - will return a maximum of 1. Defaults to 1 - - :type profanity_filter: bool - :param profanity_filter: If True, the server will attempt to filter - out profanities, replacing all but the - initial character in each filtered word with - asterisks, e.g. ``'f***'``. If False or - omitted, profanities won't be filtered out. - - :type speech_context: list - :param speech_context: A list of strings (max 50) containing words and - phrases "hints" so that the speech recognition - is more likely to recognize them. This can be - used to improve the accuracy for specific words - and phrases. This can also be used to add new - words to the vocabulary of the recognizer. - - :type single_utterance: boolean - :param single_utterance: [Optional] If false or omitted, the recognizer - will perform continuous recognition - (continuing to process audio even if the user - pauses speaking) until the client closes the - output stream (gRPC API) or when the maximum - time limit has been reached. Multiple - SpeechRecognitionResults with the is_final - flag set to true may be returned. - - If true, the recognizer will detect a single - spoken utterance. When it detects that the - user has paused or stopped speaking, it will - return an END_OF_UTTERANCE event and cease - recognition. It will return no more than one - SpeechRecognitionResult with the is_final flag - set to true. - - :type interim_results: boolean - :param interim_results: [Optional] If true, interim results (tentative - hypotheses) may be returned as they become - available (these interim results are indicated - with the is_final=false flag). If false or - omitted, only is_final=true result(s) are - returned. 
- """ - config_request = _make_streaming_config( - sample, language_code=language_code, max_alternatives=max_alternatives, - profanity_filter=profanity_filter, speech_context=speech_context, - single_utterance=single_utterance, interim_results=interim_results) - - # The config request MUST go first and not contain any audio data. - yield config_request - - buff = b'' - for data in sample.stream: - # Optimize the request data size to around 100ms. - if len(buff) + len(data) >= sample.chunk_size: - yield StreamingRecognizeRequest(audio_content=buff) - buff = data - else: - buff += data - - # Clear final contents of buffer. - yield StreamingRecognizeRequest(audio_content=buff) - - -def _make_streaming_config(sample, language_code, - max_alternatives, profanity_filter, - speech_context, single_utterance, - interim_results): - """Build streaming configuration. - - :type sample: :class:`~google.cloud.speech.sample.Sample` - :param sample: Instance of ``Sample`` containing audio information. - - :type language_code: str - :param language_code: (Optional) The language of the supplied audio as - BCP-47 language tag. Example: ``'en-GB'``. - If omitted, defaults to ``'en-US'``. - - :type max_alternatives: int - :param max_alternatives: (Optional) Maximum number of recognition - hypotheses to be returned. The server may - return fewer than maxAlternatives. - Valid values are 0-30. A value of 0 or 1 - will return a maximum of 1. Defaults to 1 - - :type profanity_filter: bool - :param profanity_filter: If True, the server will attempt to filter - out profanities, replacing all but the - initial character in each filtered word with - asterisks, e.g. ``'f***'``. If False or - omitted, profanities won't be filtered out. - - :type speech_context: list - :param speech_context: A list of strings (max 50) containing words and - phrases "hints" so that the speech recognition - is more likely to recognize them. This can be - used to improve the accuracy for specific words - and phrases. 
This can also be used to add new - words to the vocabulary of the recognizer. - - :type single_utterance: boolean - :param single_utterance: [Optional] If false or omitted, the recognizer - will perform continuous recognition - (continuing to process audio even if the user - pauses speaking) until the client closes the - output stream (gRPC API) or when the maximum - time limit has been reached. Multiple - SpeechRecognitionResults with the is_final - flag set to true may be returned. - - If true, the recognizer will detect a single - spoken utterance. When it detects that the - user has paused or stopped speaking, it will - return an END_OF_UTTERANCE event and cease - recognition. It will return no more than one - SpeechRecognitionResult with the is_final flag - set to true. - - :type interim_results: boolean - :param interim_results: [Optional] If true, interim results (tentative - hypotheses) may be returned as they become - available (these interim results are indicated - with the is_final=false flag). If false or - omitted, only is_final=true result(s) are - returned. - - :rtype: :class:`~StreamingRecognitionConfig` - :returns: Instance of ``StreamingRecognitionConfig``. 
- """ - config = RecognitionConfig( - encoding=sample.encoding, sample_rate=sample.sample_rate, - language_code=language_code, max_alternatives=max_alternatives, - profanity_filter=profanity_filter, speech_context=speech_context) - - streaming_config = StreamingRecognitionConfig( - config=config, single_utterance=single_utterance, - interim_results=interim_results) - - config_request = StreamingRecognizeRequest( - streaming_config=streaming_config) - - return config_request diff --git a/speech/google/cloud/speech/streaming/response.py b/speech/google/cloud/speech/streaming_response.py similarity index 83% rename from speech/google/cloud/speech/streaming/response.py rename to speech/google/cloud/speech/streaming_response.py index 4caf39ba186c..55e2321f1a1d 100644 --- a/speech/google/cloud/speech/streaming/response.py +++ b/speech/google/cloud/speech/streaming_response.py @@ -14,8 +14,7 @@ """Representation of a GAPIC Speech API response.""" -from google.cloud.speech.streaming.endpointer_type import EndpointerType -from google.cloud.speech.streaming.result import StreamingSpeechResult +from google.cloud.speech.streaming_result import StreamingSpeechResult class StreamingSpeechResponse(object): @@ -71,7 +70,7 @@ def confidence(self): :returns: Confidence score of recognized speech [0.0-1.0]. """ if self.results and self.results[0].alternatives: - return self.results[0].alternatives[0].confidence + return self.results[0].alternatives[0].confidence else: return 0.0 @@ -91,8 +90,8 @@ def is_final(self): :rtype: bool :returns: True if the result has completed it's processing. """ - if len(self.results): - return self.results[0].is_final + if self.results: + return bool(self.results[0].is_final) else: return False @@ -122,6 +121,28 @@ def transcript(self): :returns: Transcript text from response. 
""" if self.results and self.results[0].alternatives: - return self.results[0].alternatives[0].transcript + return self.results[0].alternatives[0].transcript else: return '' + + +class EndpointerType(object): + """Endpointer type for tracking state of Speech API detection. + + See: + https://cloud.google.com/speech/reference/rpc/\ + google.cloud.speech.v1beta1#endpointertype + """ + ENDPOINTER_EVENT_UNSPECIFIED = 0 + START_OF_SPEECH = 1 + END_OF_SPEECH = 2 + END_OF_AUDIO = 3 + END_OF_UTTERANCE = 4 + + reverse_map = { + 0: 'ENDPOINTER_EVENT_UNSPECIFIED', + 1: 'START_OF_SPEECH', + 2: 'END_OF_SPEECH', + 3: 'END_OF_AUDIO', + 4: 'END_OF_UTTERANCE' + } diff --git a/speech/google/cloud/speech/streaming/result.py b/speech/google/cloud/speech/streaming_result.py similarity index 100% rename from speech/google/cloud/speech/streaming/result.py rename to speech/google/cloud/speech/streaming_result.py diff --git a/speech/unit_tests/streaming/__init__.py b/speech/unit_tests/streaming/__init__.py deleted file mode 100644 index 58e0d9153632..000000000000 --- a/speech/unit_tests/streaming/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/speech/unit_tests/test_client.py b/speech/unit_tests/test_client.py index 9d3176c56ba9..b9a436d1e22c 100644 --- a/speech/unit_tests/test_client.py +++ b/speech/unit_tests/test_client.py @@ -218,23 +218,6 @@ def test_set_speech_api(self): speech_api = client.speech_api self.assertIsInstance(speech_api, _MockGAPICSpeechAPI) - def test_streaming_closed_stream(self): - from io import BytesIO - from google.cloud.speech.encoding import Encoding - - stream = BytesIO(b'Some audio data...') - credentials = _Credentials() - client = self._makeOne(credentials=credentials) - client.connection = _Connection() - client._speech_api = _MockGAPICSpeechAPI() - - stream.close() - sample = client.sample(stream=stream, - encoding=Encoding.LINEAR16, - sample_rate=self.SAMPLE_RATE) - with self.assertRaises(ValueError): - next(client.stream_recognize(sample)) - def test_streaming_with_empty_response(self): from io import BytesIO from google.cloud.speech.encoding import Encoding @@ -256,7 +239,7 @@ def test_streaming_with_empty_response(self): def test_stream_recognize(self): from io import BytesIO from google.cloud.speech.encoding import Encoding - from google.cloud.speech.streaming.response import ( + from google.cloud.speech.streaming_response import ( StreamingSpeechResponse) stream = BytesIO(b'Some audio data...') diff --git a/speech/unit_tests/streaming/test_request.py b/speech/unit_tests/test_request.py similarity index 86% rename from speech/unit_tests/streaming/test_request.py rename to speech/unit_tests/test_request.py index 798a13b0acd3..b536c661aecb 100644 --- a/speech/unit_tests/streaming/test_request.py +++ b/speech/unit_tests/test_request.py @@ -20,7 +20,7 @@ def test_make_request_stream(self): from io import BytesIO from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( StreamingRecognizeRequest) - from google.cloud.speech.streaming.request import _make_request_stream + from google.cloud.speech.client import _make_request_stream from 
google.cloud.speech.sample import Sample stream = BytesIO(b'g' * 1702) # Something bigger than a chunk. @@ -36,10 +36,10 @@ def test_make_request_stream_short(self): from io import BytesIO from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( StreamingRecognizeRequest) - from google.cloud.speech.streaming.request import _make_request_stream + from google.cloud.speech.client import _make_request_stream from google.cloud.speech.sample import Sample - stream = BytesIO(b'g' * (1599 * 2)) # Something bigger than a chunk. + stream = BytesIO(b'g' * (1599 * 4)) # Something bigger than a chunk. sample = Sample(stream=stream, encoding='LINEAR16') request_count = 0 @@ -47,4 +47,4 @@ def test_make_request_stream_short(self): request_count += 1 self.assertIsInstance(req, StreamingRecognizeRequest) - self.assertEqual(request_count, 3) + self.assertEqual(request_count, 5) diff --git a/speech/unit_tests/streaming/test_response.py b/speech/unit_tests/test_response.py similarity index 96% rename from speech/unit_tests/streaming/test_response.py rename to speech/unit_tests/test_response.py index d2a695d3f14d..413aa6b72871 100644 --- a/speech/unit_tests/streaming/test_response.py +++ b/speech/unit_tests/test_response.py @@ -17,7 +17,7 @@ class TestStreamingSpeechResponse(unittest.TestCase): def _getTargetClass(self): - from google.cloud.speech.streaming.response import ( + from google.cloud.speech.streaming_response import ( StreamingSpeechResponse) return StreamingSpeechResponse From 0a04d5f3509af51430679a28d7a3c99801efb4e2 Mon Sep 17 00:00:00 2001 From: Thomas Schultz Date: Tue, 18 Oct 2016 13:10:08 -0400 Subject: [PATCH 4/6] Feedback updates. 
--- docs/speech-streaming.rst | 8 ----- docs/speech-usage.rst | 36 ++++++++++--------- speech/google/cloud/speech/client.py | 36 ++++++++++--------- speech/google/cloud/speech/endpointer_type.py | 0 speech/google/cloud/speech/sample.py | 2 +- .../google/cloud/speech/streaming_response.py | 36 ++++++++++--------- .../google/cloud/speech/streaming_result.py | 2 +- speech/unit_tests/test_client.py | 17 +++++++-- speech/unit_tests/test_response.py | 2 +- 9 files changed, 76 insertions(+), 63 deletions(-) delete mode 100644 speech/google/cloud/speech/endpointer_type.py diff --git a/docs/speech-streaming.rst b/docs/speech-streaming.rst index eab505b4d06d..4a04b8ba79ae 100644 --- a/docs/speech-streaming.rst +++ b/docs/speech-streaming.rst @@ -13,11 +13,3 @@ Streaming Speech Result :members: :undoc-members: :show-inheritance: - -Streaming Endpointer Type -========================= - -.. automodule:: google.cloud.speech.endpointer_type - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/speech-usage.rst b/docs/speech-usage.rst index 8b649fae9dce..e1909057dfa2 100644 --- a/docs/speech-usage.rst +++ b/docs/speech-usage.rst @@ -155,41 +155,43 @@ data to possible text alternatives on the fly. .. code-block:: python - >>> import io >>> from google.cloud import speech >>> client = speech.Client() - >>> with io.open('./hello.wav', 'rb') as stream: + >>> with open('./hello.wav', 'rb') as stream: ... sample = client.sample(stream=stream, encoding=speech.Encoding.LINEAR16, ... sample_rate=16000) ... for response in client.stream_recognize(sample): ... print(response.transcript) - hello ... print(response.is_final) + hello True -By setting ``interim_results`` to true, interim results (tentative hypotheses) +By setting ``interim_results`` to :data:`True`, interim results (tentative hypotheses) may be returned as they become available (these interim results are indicated -with the is_final=false flag). 
If false or omitted, only is_final=true +with the ``is_final=false`` flag). If :data:`False` or omitted, only ``is_final=true`` result(s) are returned. .. code-block:: python - >>> import io >>> from google.cloud import speech >>> client = speech.Client() - >>> with io.open('./hello.wav', 'rb') as stream: - >>> sample = client.sample(stream=stream, encoding=speech.Encoding.LINEAR16, + >>> with open('./hello.wav', 'rb') as stream: + ... sample = client.sample(stream=stream, encoding=speech.Encoding.LINEAR16, ... sample_rate=16000) ... for response in client.stream_recognize(sample, ... interim_results=True): + ... print('====Response====') ... print(response.transcript) - hell ... print(response.is_final) + ====Response==== + he False - ... print(response.transcript) + ====Response==== + hell + False + ====Response==== hello - ... print(response.is_final) True @@ -204,13 +206,15 @@ See: `Single Utterance`_ .. code-block:: python - >>> with io.open('./hello_pause_goodbye.wav', 'rb') as stream: - >>> sample = client.sample(stream=stream, encoding=speech.Encoding.LINEAR16, + >>> with open('./hello_pause_goodbye.wav', 'rb') as stream: + ... sample = client.sample(stream=stream, encoding=speech.Encoding.LINEAR16, ... sample_rate=16000) - ... stream_container = client.stream_recognize(sample, - ... single_utterance=True) - >>> print(stream_container.get_full_text()) + ... for response in client.stream_recognize(sample, + ... single_utterance=True): + ... print(response.transcript) + ... print(response.is_final) hello + True .. _Single Utterance: https://cloud.google.com/speech/reference/rpc/google.cloud.speech.v1beta1#streamingrecognitionconfig ..
_sync_recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/syncrecognize diff --git a/speech/google/cloud/speech/client.py b/speech/google/cloud/speech/client.py index 39b29a808761..7ffbf8e0ecfd 100644 --- a/speech/google/cloud/speech/client.py +++ b/speech/google/cloud/speech/client.py @@ -234,6 +234,9 @@ def stream_recognize(self, sample, language_code=None, See: https://cloud.google.com/speech/limits#content + Yields :class:`~streaming_response.StreamingSpeechResponse` containing + results and metadata from the streaming request. + :type sample: :class:`~google.cloud.speech.sample.Sample` :param sample: Instance of ``Sample`` containing audio information. @@ -264,8 +267,8 @@ def stream_recognize(self, sample, language_code=None, and phrases. This can also be used to add new words to the vocabulary of the recognizer. - :type single_utterance: boolean - :param single_utterance: [Optional] If false or omitted, the recognizer + :type single_utterance: bool + :param single_utterance: (Optional) If false or omitted, the recognizer will perform continuous recognition (continuing to process audio even if the user pauses speaking) until the client closes the @@ -282,13 +285,15 @@ def stream_recognize(self, sample, language_code=None, SpeechRecognitionResult with the is_final flag set to true. - :type interim_results: boolean - :param interim_results: [Optional] If true, interim results (tentative + :type interim_results: bool + :param interim_results: (Optional) If true, interim results (tentative hypotheses) may be returned as they become available (these interim results are indicated with the is_final=false flag). If false or omitted, only is_final=true result(s) are returned. + + :raises: :class:`EnvironmentError` if gRPC is not enabled. 
""" if not _USE_GAX: raise EnvironmentError('gRPC is required to use this API.') @@ -301,7 +306,7 @@ def stream_recognize(self, sample, language_code=None, interim_results=interim_results) for response in self.speech_api.streaming_recognize(requests): - if hasattr(response, 'results') or interim_results: + if getattr(response, 'results', None) or interim_results: yield StreamingSpeechResponse.from_pb(response) @property @@ -414,8 +419,8 @@ def _make_request_stream(sample, language_code=None, max_alternatives=None, and phrases. This can also be used to add new words to the vocabulary of the recognizer. - :type single_utterance: boolean - :param single_utterance: [Optional] If false or omitted, the recognizer + :type single_utterance: bool + :param single_utterance: (Optional) If false or omitted, the recognizer will perform continuous recognition (continuing to process audio even if the user pauses speaking) until the client closes the @@ -432,8 +437,8 @@ def _make_request_stream(sample, language_code=None, max_alternatives=None, SpeechRecognitionResult with the is_final flag set to true. - :type interim_results: boolean - :param interim_results: [Optional] If true, interim results (tentative + :type interim_results: bool + :param interim_results: (Optional) If true, interim results (tentative hypotheses) may be returned as they become available (these interim results are indicated with the is_final=false flag). If false or @@ -452,7 +457,6 @@ def _make_request_stream(sample, language_code=None, max_alternatives=None, data = sample.stream.read(sample.chunk_size) if not data: break - # Optimize the request data size to around 100ms. yield StreamingRecognizeRequest(audio_content=data) @@ -466,12 +470,12 @@ def _make_streaming_config(sample, language_code, :param sample: Instance of ``Sample`` containing audio information. 
:type language_code: str - :param language_code: (Optional) The language of the supplied audio as + :param language_code: The language of the supplied audio as BCP-47 language tag. Example: ``'en-GB'``. If omitted, defaults to ``'en-US'``. :type max_alternatives: int - :param max_alternatives: (Optional) Maximum number of recognition + :param max_alternatives: Maximum number of recognition hypotheses to be returned. The server may return fewer than maxAlternatives. Valid values are 0-30. A value of 0 or 1 @@ -492,8 +496,8 @@ def _make_streaming_config(sample, language_code, and phrases. This can also be used to add new words to the vocabulary of the recognizer. - :type single_utterance: boolean - :param single_utterance: [Optional] If false or omitted, the recognizer + :type single_utterance: bool + :param single_utterance: If false or omitted, the recognizer will perform continuous recognition (continuing to process audio even if the user pauses speaking) until the client closes the @@ -510,8 +514,8 @@ def _make_streaming_config(sample, language_code, SpeechRecognitionResult with the is_final flag set to true. - :type interim_results: boolean - :param interim_results: [Optional] If true, interim results (tentative + :type interim_results: bool + :param interim_results: If true, interim results (tentative hypotheses) may be returned as they become available (these interim results are indicated with the is_final=false flag). If false or diff --git a/speech/google/cloud/speech/endpointer_type.py b/speech/google/cloud/speech/endpointer_type.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/speech/google/cloud/speech/sample.py b/speech/google/cloud/speech/sample.py index fbc97adf1d0d..ef82f3a8f485 100644 --- a/speech/google/cloud/speech/sample.py +++ b/speech/google/cloud/speech/sample.py @@ -77,7 +77,7 @@ def chunk_size(self): :rtype: int :returns: Optimized chunk size. 
""" - return int(self.sample_rate / 10) + return int(self.sample_rate / 10.0) @property def source_uri(self): diff --git a/speech/google/cloud/speech/streaming_response.py b/speech/google/cloud/speech/streaming_response.py index 55e2321f1a1d..b74beba5840b 100644 --- a/speech/google/cloud/speech/streaming_response.py +++ b/speech/google/cloud/speech/streaming_response.py @@ -15,6 +15,7 @@ """Representation of a GAPIC Speech API response.""" from google.cloud.speech.streaming_result import StreamingSpeechResult +from google.cloud.gapic.speech.v1beta1.enums import StreamingRecognizeResponse class StreamingSpeechResponse(object): @@ -38,7 +39,7 @@ def __init__(self, error=None, endpointer_type=None, results=None, result_index=None): results = results or [] self._error = error - self._endpointer_type = EndpointerType.reverse_map.get( + self._endpointer_type = EndpointerType.REVERSE_MAP.get( endpointer_type, None) self._result_index = result_index self._results = [StreamingSpeechResult.from_pb(result) @@ -91,7 +92,7 @@ def is_final(self): :returns: True if the result has completed it's processing. """ if self.results: - return bool(self.results[0].is_final) + return self.results[0].is_final else: return False @@ -123,26 +124,27 @@ def transcript(self): if self.results and self.results[0].alternatives: return self.results[0].alternatives[0].transcript else: - return '' + return None -class EndpointerType(object): +class EndpointerType(StreamingRecognizeResponse.EndpointerType): """Endpointer type for tracking state of Speech API detection. + ENDPOINTER_EVENT_UNSPECIFIED (int): No endpointer event specified. + START_OF_SPEECH (int): Speech has been detected in the audio stream. + END_OF_SPEECH (int): Speech has ceased to be detected in the audio + stream. + END_OF_AUDIO (int): The end of the audio stream has been reached. and + it is being processed. + END_OF_UTTERANCE (int): This event is only sent when + ``single_utterance`` is ``true``. 
It indicates that the server has + detected the end of the user's speech utterance and expects no + additional speech. Therefore, the server will not process additional + audio. The client should stop sending additional audio data. + See: https://cloud.google.com/speech/reference/rpc/\ google.cloud.speech.v1beta1#endpointertype """ - ENDPOINTER_EVENT_UNSPECIFIED = 0 - START_OF_SPEECH = 1 - END_OF_SPEECH = 2 - END_OF_AUDIO = 3 - END_OF_UTTERANCE = 4 - - reverse_map = { - 0: 'ENDPOINTER_EVENT_UNSPECIFIED', - 1: 'START_OF_SPEECH', - 2: 'END_OF_SPEECH', - 3: 'END_OF_AUDIO', - 4: 'END_OF_UTTERANCE' - } + REVERSE_MAP = {v: k for k, v + in vars(StreamingRecognizeResponse.EndpointerType).items()} diff --git a/speech/google/cloud/speech/streaming_result.py b/speech/google/cloud/speech/streaming_result.py index 6cfc37c18ad5..104916eda9e0 100644 --- a/speech/google/cloud/speech/streaming_result.py +++ b/speech/google/cloud/speech/streaming_result.py @@ -70,4 +70,4 @@ def is_final(self): :rtype: bool :returns: True if the result has completed it's processing. """ - return bool(self._is_final) + return self._is_final diff --git a/speech/unit_tests/test_client.py b/speech/unit_tests/test_client.py index b9a436d1e22c..5fe8c3107fb7 100644 --- a/speech/unit_tests/test_client.py +++ b/speech/unit_tests/test_client.py @@ -251,19 +251,30 @@ def test_stream_recognize(self): sample = client.sample(stream=stream, encoding=Encoding.LINEAR16, sample_rate=self.SAMPLE_RATE) - results = client.stream_recognize(sample) + responses = client.stream_recognize(sample) - self.assertIsInstance(next(results), StreamingSpeechResponse) + self.assertIsInstance(next(responses), StreamingSpeechResponse) requests = [] for req in client.speech_api._requests: requests.append(req) self.assertEqual(len(requests), 2) +class _MockSpeechGAPICAlternative(object): + transcript = 'hello there!' 
+ confidence = 0.9704365 + + +class _MockSpeechGAPICResult(object): + alternatives = [_MockSpeechGAPICAlternative()] + is_final = False + stability = 0.0 + + class _MockGAPICSpeechResponse(object): error = None endpointer_type = None - results = [] + results = [_MockSpeechGAPICResult()] result_index = 0 diff --git a/speech/unit_tests/test_response.py b/speech/unit_tests/test_response.py index 413aa6b72871..410d1caaa645 100644 --- a/speech/unit_tests/test_response.py +++ b/speech/unit_tests/test_response.py @@ -30,7 +30,7 @@ def test_ctor(self): self.assertEqual(response.confidence, 0.0) self.assertEqual(response.endpointer_type, None) self.assertEqual(response.results, []) - self.assertEqual(response.transcript, '') + self.assertEqual(response.transcript, None) self.assertFalse(response.is_final) def test_from_pb(self): From f6ff6548c527562a258120ceaa9f01df6af60011 Mon Sep 17 00:00:00 2001 From: Thomas Schultz Date: Wed, 19 Oct 2016 09:27:28 -0400 Subject: [PATCH 5/6] More verbose access to alternatives. --- speech/google/cloud/speech/client.py | 16 ++++++++-------- speech/google/cloud/speech/streaming_response.py | 6 ++++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/speech/google/cloud/speech/client.py b/speech/google/cloud/speech/client.py index 7ffbf8e0ecfd..b5622575b3c9 100644 --- a/speech/google/cloud/speech/client.py +++ b/speech/google/cloud/speech/client.py @@ -405,19 +405,19 @@ def _make_request_stream(sample, language_code=None, max_alternatives=None, will return a maximum of 1. Defaults to 1 :type profanity_filter: bool - :param profanity_filter: If True, the server will attempt to filter - out profanities, replacing all but the + :param profanity_filter: (Optional) If True, the server will attempt to + filter out profanities, replacing all but the initial character in each filtered word with asterisks, e.g. ``'f***'``. If False or omitted, profanities won't be filtered out. 
:type speech_context: list - :param speech_context: A list of strings (max 50) containing words and - phrases "hints" so that the speech recognition - is more likely to recognize them. This can be - used to improve the accuracy for specific words - and phrases. This can also be used to add new - words to the vocabulary of the recognizer. + :param speech_context: (Optional) A list of strings (max 50) containing + words and phrases "hints" so that the speech + recognition is more likely to recognize them. + This can be used to improve the accuracy for + specific words and phrases. This can also be used to + add new words to the vocabulary of the recognizer. :type single_utterance: bool :param single_utterance: (Optional) If false or omitted, the recognizer diff --git a/speech/google/cloud/speech/streaming_response.py b/speech/google/cloud/speech/streaming_response.py index b74beba5840b..6c64f46bf6dc 100644 --- a/speech/google/cloud/speech/streaming_response.py +++ b/speech/google/cloud/speech/streaming_response.py @@ -71,7 +71,8 @@ def confidence(self): :returns: Confidence score of recognized speech [0.0-1.0]. """ if self.results and self.results[0].alternatives: - return self.results[0].alternatives[0].confidence + top_alternative = self.results[0].alternatives[0] + return top_alternative.confidence else: return 0.0 @@ -122,7 +123,8 @@ def transcript(self): :returns: Transcript text from response. """ if self.results and self.results[0].alternatives: - return self.results[0].alternatives[0].transcript + top_alternative = self.results[0].alternatives[0] + return top_alternative.transcript else: return None From 4e536abea1b64a650e099b77ba845e3398de9cb1 Mon Sep 17 00:00:00 2001 From: Thomas Schultz Date: Fri, 21 Oct 2016 18:29:17 -0400 Subject: [PATCH 6/6] Remove response calculating confidence and transcript.
--- .../google/cloud/speech/streaming_response.py | 61 +++---------------- speech/unit_tests/test_response.py | 7 +-- 2 files changed, 11 insertions(+), 57 deletions(-) diff --git a/speech/google/cloud/speech/streaming_response.py b/speech/google/cloud/speech/streaming_response.py index 6c64f46bf6dc..a720870c5e89 100644 --- a/speech/google/cloud/speech/streaming_response.py +++ b/speech/google/cloud/speech/streaming_response.py @@ -17,6 +17,11 @@ from google.cloud.speech.streaming_result import StreamingSpeechResult from google.cloud.gapic.speech.v1beta1.enums import StreamingRecognizeResponse +_REVERSE_MAP = { + value: key for key, value + in StreamingRecognizeResponse.EndpointerType.__dict__.items() + if not key.startswith('__')} + class StreamingSpeechResponse(object): """Representation of a Speech API protobuf streaming response. @@ -24,8 +29,8 @@ class StreamingSpeechResponse(object): :type error: :class:`google.grpc.Status` :param error: Instance of ``Status`` - :type endpointer_type: :class:`~EndpointerType` - :param endpointer_type: Enum of endpointer event. + :type endpointer_type: int + :param endpointer_type: Integer value of endpointer event. :type results: list of :class:`google.cloud.speech.v1beta1.StreamingRecognitionResult` @@ -39,8 +44,7 @@ def __init__(self, error=None, endpointer_type=None, results=None, result_index=None): results = results or [] self._error = error - self._endpointer_type = EndpointerType.REVERSE_MAP.get( - endpointer_type, None) + self._endpointer_type = _REVERSE_MAP.get(endpointer_type) self._result_index = result_index self._results = [StreamingSpeechResult.from_pb(result) for result in results] @@ -63,19 +67,6 @@ def from_pb(cls, pb_response): return cls(error=error, endpointer_type=endpointer_type, results=results, result_index=result_index) - @property - def confidence(self): - """Confidence score for recognized speech. - - :rtype: float - :returns: Confidence score of recognized speech [0.0-1.0]. 
- """ - if self.results and self.results[0].alternatives: - top_alternative = self.results[0].alternatives[0] - return top_alternative.confidence - else: - return 0.0 - @property def endpointer_type(self): """Endpointer indicating the state of the speech detection. @@ -114,39 +105,3 @@ def results(self): :returns: List of ``StreamingSpeechResult`` in this response. """ return self._results - - @property - def transcript(self): - """Get most likely transcript from response. - - :rtype: str - :returns: Transcript text from response. - """ - if self.results and self.results[0].alternatives: - top_alternative = self.results[0].alternatives[0] - return top_alternative.transcript - else: - return None - - -class EndpointerType(StreamingRecognizeResponse.EndpointerType): - """Endpointer type for tracking state of Speech API detection. - - ENDPOINTER_EVENT_UNSPECIFIED (int): No endpointer event specified. - START_OF_SPEECH (int): Speech has been detected in the audio stream. - END_OF_SPEECH (int): Speech has ceased to be detected in the audio - stream. - END_OF_AUDIO (int): The end of the audio stream has been reached. and - it is being processed. - END_OF_UTTERANCE (int): This event is only sent when - ``single_utterance`` is ``true``. It indicates that the server has - detected the end of the user's speech utterance and expects no - additional speech. Therefore, the server will not process additional - audio. The client should stop sending additional audio data. 
- - See: - https://cloud.google.com/speech/reference/rpc/\ - google.cloud.speech.v1beta1#endpointertype - """ - REVERSE_MAP = {v: k for k, v - in vars(StreamingRecognizeResponse.EndpointerType).items()} diff --git a/speech/unit_tests/test_response.py b/speech/unit_tests/test_response.py index 410d1caaa645..5a156b74d4b3 100644 --- a/speech/unit_tests/test_response.py +++ b/speech/unit_tests/test_response.py @@ -27,10 +27,8 @@ def _makeOne(self, *args, **kw): def test_ctor(self): response = self._makeOne({}, 'END_OF_UTTERANCE', [], 0) self.assertEqual(response.result_index, 0) - self.assertEqual(response.confidence, 0.0) self.assertEqual(response.endpointer_type, None) self.assertEqual(response.results, []) - self.assertEqual(response.transcript, None) self.assertFalse(response.is_final) def test_from_pb(self): @@ -38,8 +36,9 @@ def test_from_pb(self): res = response.from_pb(_MockSpeechPBResponse) self.assertFalse(res.is_final) self.assertEqual(res.endpointer_type, 'END_OF_AUDIO') - self.assertEqual(res.transcript, 'hello there!') - self.assertEqual(res.confidence, 0.9704365) + self.assertEqual(res.results[0].alternatives[0].transcript, + 'hello there!') + self.assertEqual(res.results[0].alternatives[0].confidence, 0.9704365) class _MockSpeechPBAlternative(object):