From 77e832db7a613ccc29155b63d4206b9c072eab26 Mon Sep 17 00:00:00 2001 From: Shahin Date: Fri, 20 Jul 2018 16:20:29 -0700 Subject: [PATCH] Diarization Output Modified [(#1586)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1586) * Printing the last paragraph only. * Python3 print * Removing sample rate setting * Adding the missing output parameter in the example * Changes based on the comments * Removed filenames as input parameters * Removed unused args * Updated README file * Updated the inline comment * Modified code to make it more readable * Simplified the response object processing. * Fixing the long line issue. --- speech/snippets/README.rst | 19 +++--- speech/snippets/beta_snippets.py | 97 ++++++++++++--------------- speech/snippets/beta_snippets_test.py | 23 +++---- 3 files changed, 60 insertions(+), 79 deletions(-) diff --git a/speech/snippets/README.rst b/speech/snippets/README.rst index 394c97c81f2f..b6680c880f46 100644 --- a/speech/snippets/README.rst +++ b/speech/snippets/README.rst @@ -221,25 +221,22 @@ To run this sample: $ python beta_snippets.py - usage: beta_snippets.py [-h] command path first second + usage: beta_snippets.py [-h] command Google Cloud Speech API sample that demonstrates enhanced models and recognition metadata. Example usage: - python beta_snippets.py enhanced-model resources/commercial_mono.wav - python beta_snippets.py metadata resources/commercial_mono.wav - python beta_snippets.py punctuation resources/commercial_mono.wav - python beta_snippets.py diarization resources/commercial_mono.wav - python beta_snippets.py multi-channel resources/commercial_mono.wav - python beta_snippets.py multi-language resources/multi.wav en-US es - python beta_snippets.py word-level-conf resources/commercial_mono.wav + python beta_snippets.py enhanced-model + python beta_snippets.py metadata + python beta_snippets.py punctuation + python beta_snippets.py diarization + python beta_snippets.py multi-channel + python beta_snippets.py multi-language + python beta_snippets.py word-level-conf positional arguments: command - path File for audio file to be recognized - first First language in audio file to be recognized - second Second language in audio file to be recognized optional arguments: -h, --help show this help message and exit diff --git a/speech/snippets/beta_snippets.py b/speech/snippets/beta_snippets.py index 24e213be3565..30ca9cde84be 100644 --- a/speech/snippets/beta_snippets.py +++ b/speech/snippets/beta_snippets.py @@ -18,27 +18,26 @@ and recognition metadata. Example usage: - python beta_snippets.py enhanced-model resources/commercial_mono.wav - python beta_snippets.py metadata resources/commercial_mono.wav - python beta_snippets.py punctuation resources/commercial_mono.wav - python beta_snippets.py diarization resources/commercial_mono.wav - python beta_snippets.py multi-channel resources/commercial_mono.wav - python beta_snippets.py multi-language resources/multi.wav en-US es - python beta_snippets.py word-level-conf resources/commercial_mono.wav + python beta_snippets.py enhanced-model + python beta_snippets.py metadata + python beta_snippets.py punctuation + python beta_snippets.py diarization + python beta_snippets.py multi-channel + python beta_snippets.py multi-language + python beta_snippets.py word-level-conf """ import argparse import io -def transcribe_file_with_enhanced_model(speech_file): +def transcribe_file_with_enhanced_model(): """Transcribe the given audio file using an enhanced model.""" # [START speech_transcribe_file_with_enhanced_model] from google.cloud import speech_v1p1beta1 as speech client = speech.SpeechClient() - # TODO(developer): Uncomment and set to a path to your audio file. - # speech_file = 'path/to/file.wav' + speech_file = 'resources/commercial_mono.wav' with io.open(speech_file, 'rb') as audio_file: content = audio_file.read() @@ -64,14 +63,13 @@ def transcribe_file_with_enhanced_model(speech_file): # [END speech_transcribe_file_with_enhanced_model] -def transcribe_file_with_metadata(speech_file): +def transcribe_file_with_metadata(): """Send a request that includes recognition metadata.""" # [START speech_transcribe_file_with_metadata] from google.cloud import speech_v1p1beta1 as speech client = speech.SpeechClient() - # TODO(developer): Uncomment and set to a path to your audio file. - # speech_file = 'path/to/file.wav' + speech_file = 'resources/commercial_mono.wav' with io.open(speech_file, 'rb') as audio_file: content = audio_file.read() @@ -110,14 +108,13 @@ def transcribe_file_with_metadata(speech_file): # [END speech_transcribe_file_with_metadata] -def transcribe_file_with_auto_punctuation(speech_file): +def transcribe_file_with_auto_punctuation(): """Transcribe the given audio file with auto punctuation enabled.""" # [START speech_transcribe_file_with_auto_punctuation] from google.cloud import speech_v1p1beta1 as speech client = speech.SpeechClient() - # TODO(developer): Uncomment and set to a path to your audio file. - # speech_file = 'path/to/file.wav' + speech_file = 'resources/commercial_mono.wav' with io.open(speech_file, 'rb') as audio_file: content = audio_file.read() @@ -140,14 +137,13 @@ def transcribe_file_with_auto_punctuation(speech_file): # [END speech_transcribe_file_with_auto_punctuation] -def transcribe_file_with_diarization(speech_file): +def transcribe_file_with_diarization(): """Transcribe the given audio file synchronously with diarization.""" # [START speech_transcribe_diarization] from google.cloud import speech_v1p1beta1 as speech client = speech.SpeechClient() - # TODO(developer): Uncomment and set to a path to your audio file. - # speech_file = 'path/to/file.wav' + speech_file = 'resources/commercial_mono.wav' with open(speech_file, 'rb') as audio_file: content = audio_file.read() @@ -156,7 +152,7 @@ def transcribe_file_with_diarization(speech_file): config = speech.types.RecognitionConfig( encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, - sample_rate_hertz=16000, + sample_rate_hertz=8000, language_code='en-US', enable_speaker_diarization=True, diarization_speaker_count=2) @@ -164,25 +160,29 @@ def transcribe_file_with_diarization(speech_file): print('Waiting for operation to complete...') response = client.recognize(config, audio) - for i, result in enumerate(response.results): - alternative = result.alternatives[0] - print('-' * 20) - print('First alternative of result {}: {}' - .format(i, alternative.transcript)) - print('Speaker Tag for the first word: {}' - .format(alternative.words[0].speaker_tag)) + # The transcript within each result is separate and sequential per result. + # However, the words list within an alternative includes all the words + # from all the results thus far. Thus, to get all the words with speaker + # tags, you only have to take the words list from the last result: + result = response.results[-1] + + words_info = result.alternatives[0].words + + # Printing out the output: + for word_info in words_info: + print("word: '{}', speaker_tag: {}".format(word_info.word, + word_info.speaker_tag)) # [END speech_transcribe_diarization] -def transcribe_file_with_multichannel(speech_file): +def transcribe_file_with_multichannel(): """Transcribe the given audio file synchronously with multi channel.""" # [START speech_transcribe_multichannel] from google.cloud import speech_v1p1beta1 as speech client = speech.SpeechClient() - # TODO(developer): Uncomment and set to a path to your audio file. - # speech_file = 'path/to/file.wav' + speech_file = 'resources/Google_Gnome.wav' with open(speech_file, 'rb') as audio_file: content = audio_file.read() @@ -207,17 +207,16 @@ def transcribe_file_with_multichannel(speech_file): # [END speech_transcribe_multichannel] -def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang): +def transcribe_file_with_multilanguage(): """Transcribe the given audio file synchronously with multi language.""" # [START speech_transcribe_multilanguage] from google.cloud import speech_v1p1beta1 as speech client = speech.SpeechClient() - # TODO(developer): Uncomment and set to a path to your audio file. - # speech_file = 'path/to/file.wav' - # first_lang = first language code, e,g, 'en-US' - # second_lang = first language code, e,g, 'es' + speech_file = 'resources/multi.wav' + first_lang = 'en-US' + second_lang = 'es' with open(speech_file, 'rb') as audio_file: content = audio_file.read() @@ -226,6 +225,7 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang): config = speech.types.RecognitionConfig( encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=44100, audio_channel_count=2, language_code=first_lang, alternative_language_codes=[second_lang]) @@ -241,15 +241,14 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang): # [END speech_transcribe_multilanguage] -def transcribe_file_with_word_level_confidence(speech_file): +def transcribe_file_with_word_level_confidence(): """Transcribe the given audio file synchronously with word level confidence.""" # [START speech_transcribe_word_level_confidence] from google.cloud import speech_v1p1beta1 as speech client = speech.SpeechClient() - # TODO(developer): Uncomment and set to a path to your audio file. - # speech_file = 'path/to/file.wav' + speech_file = 'resources/Google_Gnome.wav' with open(speech_file, 'rb') as audio_file: content = audio_file.read() @@ -279,28 +278,20 @@ def transcribe_file_with_word_level_confidence(speech_file): description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('command') - parser.add_argument( - 'path', help='File for audio file to be recognized') - parser.add_argument( - 'first', help='First language in audio file to be recognized', - nargs='?') - parser.add_argument( - 'second', help='Second language in audio file to be recognized', - nargs='?') args = parser.parse_args() if args.command == 'enhanced-model': - transcribe_file_with_enhanced_model(args.path) + transcribe_file_with_enhanced_model() elif args.command == 'metadata': - transcribe_file_with_metadata(args.path) + transcribe_file_with_metadata() elif args.command == 'punctuation': - transcribe_file_with_auto_punctuation(args.path) + transcribe_file_with_auto_punctuation() elif args.command == 'diarization': - transcribe_file_with_diarization(args.path) + transcribe_file_with_diarization() elif args.command == 'multi-channel': - transcribe_file_with_multichannel(args.path) + transcribe_file_with_multichannel() elif args.command == 'multi-language': - transcribe_file_with_multilanguage(args.path, args.first, args.second) + transcribe_file_with_multilanguage() elif args.command == 'word-level-conf': - transcribe_file_with_word_level_confidence(args.path) + transcribe_file_with_word_level_confidence() diff --git a/speech/snippets/beta_snippets_test.py b/speech/snippets/beta_snippets_test.py index bbb6c75f674f..5720da420d5d 100644 --- a/speech/snippets/beta_snippets_test.py +++ b/speech/snippets/beta_snippets_test.py @@ -26,56 +26,49 @@ def test_transcribe_file_with_enhanced_model(capsys): - transcribe_file_with_enhanced_model( - os.path.join(RESOURCES, 'commercial_mono.wav')) + transcribe_file_with_enhanced_model() out, _ = capsys.readouterr() assert 'Chrome' in out def test_transcribe_file_with_metadata(capsys): - transcribe_file_with_metadata( - os.path.join(RESOURCES, 'commercial_mono.wav')) + transcribe_file_with_metadata() out, _ = capsys.readouterr() assert 'Chrome' in out def test_transcribe_file_with_auto_punctuation(capsys): - transcribe_file_with_auto_punctuation( - os.path.join(RESOURCES, 'commercial_mono.wav')) + transcribe_file_with_auto_punctuation() out, _ = capsys.readouterr() assert 'Okay. Sure.' in out def test_transcribe_diarization(capsys): - transcribe_file_with_diarization( - os.path.join(RESOURCES, 'Google_Gnome.wav')) + transcribe_file_with_diarization() out, err = capsys.readouterr() - assert 'OK Google stream stranger things from Netflix to my TV' in out + assert "word: 'here', speaker_tag: 1" in out def test_transcribe_multichannel_file(capsys): - transcribe_file_with_multichannel( - os.path.join(RESOURCES, 'Google_Gnome.wav')) + transcribe_file_with_multichannel() out, err = capsys.readouterr() assert 'OK Google stream stranger things from Netflix to my TV' in out def test_transcribe_multilanguage_file(capsys): - transcribe_file_with_multilanguage( - os.path.join(RESOURCES, 'multi.wav'), 'en-US', 'es') + transcribe_file_with_multilanguage() out, err = capsys.readouterr() assert 'how are you doing estoy bien e tu' in out def test_transcribe_word_level_confidence(capsys): - transcribe_file_with_word_level_confidence( - os.path.join(RESOURCES, 'Google_Gnome.wav')) + transcribe_file_with_word_level_confidence() out, err = capsys.readouterr() assert 'OK Google stream stranger things from Netflix to my TV' in out