Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Diarization Output Modified #1586

Merged
merged 13 commits into from
Jul 20, 2018
19 changes: 8 additions & 11 deletions speech/cloud-client/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -221,25 +221,22 @@ To run this sample:

$ python beta_snippets.py

usage: beta_snippets.py [-h] command path first second
usage: beta_snippets.py [-h] command

Google Cloud Speech API sample that demonstrates enhanced models
and recognition metadata.

Example usage:
python beta_snippets.py enhanced-model resources/commercial_mono.wav
python beta_snippets.py metadata resources/commercial_mono.wav
python beta_snippets.py punctuation resources/commercial_mono.wav
python beta_snippets.py diarization resources/commercial_mono.wav
python beta_snippets.py multi-channel resources/commercial_mono.wav
python beta_snippets.py multi-language resources/multi.wav en-US es
python beta_snippets.py word-level-conf resources/commercial_mono.wav
python beta_snippets.py enhanced-model
python beta_snippets.py metadata
python beta_snippets.py punctuation
python beta_snippets.py diarization
python beta_snippets.py multi-channel
python beta_snippets.py multi-language
python beta_snippets.py word-level-conf

positional arguments:
command
path File for audio file to be recognized
first First language in audio file to be recognized
second Second language in audio file to be recognized

optional arguments:
-h, --help show this help message and exit
Expand Down
97 changes: 44 additions & 53 deletions speech/cloud-client/beta_snippets.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,26 @@
and recognition metadata.

Example usage:
python beta_snippets.py enhanced-model resources/commercial_mono.wav
python beta_snippets.py metadata resources/commercial_mono.wav
python beta_snippets.py punctuation resources/commercial_mono.wav
python beta_snippets.py diarization resources/commercial_mono.wav
python beta_snippets.py multi-channel resources/commercial_mono.wav
python beta_snippets.py multi-language resources/multi.wav en-US es
python beta_snippets.py word-level-conf resources/commercial_mono.wav
python beta_snippets.py enhanced-model
python beta_snippets.py metadata
python beta_snippets.py punctuation
python beta_snippets.py diarization
python beta_snippets.py multi-channel
python beta_snippets.py multi-language
python beta_snippets.py word-level-conf
"""

import argparse
import io


def transcribe_file_with_enhanced_model(speech_file):
def transcribe_file_with_enhanced_model():
"""Transcribe the given audio file using an enhanced model."""
# [START speech_transcribe_file_with_enhanced_model]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/commercial_mono.wav'

with io.open(speech_file, 'rb') as audio_file:
content = audio_file.read()
Expand All @@ -64,14 +63,13 @@ def transcribe_file_with_enhanced_model(speech_file):
# [END speech_transcribe_file_with_enhanced_model]


def transcribe_file_with_metadata(speech_file):
def transcribe_file_with_metadata():
"""Send a request that includes recognition metadata."""
# [START speech_transcribe_file_with_metadata]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/commercial_mono.wav'

with io.open(speech_file, 'rb') as audio_file:
content = audio_file.read()
Expand Down Expand Up @@ -110,14 +108,13 @@ def transcribe_file_with_metadata(speech_file):
# [END speech_transcribe_file_with_metadata]


def transcribe_file_with_auto_punctuation(speech_file):
def transcribe_file_with_auto_punctuation():
"""Transcribe the given audio file with auto punctuation enabled."""
# [START speech_transcribe_file_with_auto_punctuation]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/commercial_mono.wav'

with io.open(speech_file, 'rb') as audio_file:
content = audio_file.read()
Expand All @@ -140,14 +137,13 @@ def transcribe_file_with_auto_punctuation(speech_file):
# [END speech_transcribe_file_with_auto_punctuation]


def transcribe_file_with_diarization(speech_file):
def transcribe_file_with_diarization():
"""Transcribe the given audio file synchronously with diarization."""
# [START speech_transcribe_diarization]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/commercial_mono.wav'

with open(speech_file, 'rb') as audio_file:
content = audio_file.read()
Expand All @@ -156,33 +152,37 @@ def transcribe_file_with_diarization(speech_file):

config = speech.types.RecognitionConfig(
encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
sample_rate_hertz=8000,
language_code='en-US',
enable_speaker_diarization=True,
diarization_speaker_count=2)

print('Waiting for operation to complete...')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider using python logging facility. Understandably, for this sample it might be overkill so take it or leave it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW nearly all our other Python samples do print(). It's true that it's not always the recommended practice in production, but it's easy to understand. With logging there's always the risk that the developer has some weird config where the logs end up where maybe they don't expect.

response = client.recognize(config, audio)

for i, result in enumerate(response.results):
alternative = result.alternatives[0]
print('-' * 20)
print('First alternative of result {}: {}'
.format(i, alternative.transcript))
print('Speaker Tag for the first word: {}'
.format(alternative.words[0].speaker_tag))
# The transcript within each result is separate and sequential per result.
# However, the words list within an alternative includes all the words
# from all the results thus far. Thus, to get all the words with speaker
# tags, you only have to take the words list from the last result:
result = response.results[-1]

words_info = result.alternatives[0].words

# Printing out the output:
for word_info in words_info:
print("word: '{}', speaker_tag: {}".format(word_info.word,
word_info.speaker_tag))
# [END speech_transcribe_diarization]


def transcribe_file_with_multichannel(speech_file):
def transcribe_file_with_multichannel():
"""Transcribe the given audio file synchronously with
multi channel."""
# [START speech_transcribe_multichannel]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/Google_Gnome.wav'

with open(speech_file, 'rb') as audio_file:
content = audio_file.read()
Expand All @@ -207,17 +207,16 @@ def transcribe_file_with_multichannel(speech_file):
# [END speech_transcribe_multichannel]


def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
def transcribe_file_with_multilanguage():
"""Transcribe the given audio file synchronously with
multi language."""
# [START speech_transcribe_multilanguage]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
# first_lang = first language code, e.g. 'en-US'
# second_lang = second language code, e.g. 'es'
speech_file = 'resources/multi.wav'
first_lang = 'en-US'
second_lang = 'es'

with open(speech_file, 'rb') as audio_file:
content = audio_file.read()
Expand All @@ -226,6 +225,7 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):

config = speech.types.RecognitionConfig(
encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=44100,
audio_channel_count=2,
language_code=first_lang,
alternative_language_codes=[second_lang])
Expand All @@ -241,15 +241,14 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
# [END speech_transcribe_multilanguage]


def transcribe_file_with_word_level_confidence(speech_file):
def transcribe_file_with_word_level_confidence():
"""Transcribe the given audio file synchronously with
word level confidence."""
# [START speech_transcribe_word_level_confidence]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/Google_Gnome.wav'

with open(speech_file, 'rb') as audio_file:
content = audio_file.read()
Expand Down Expand Up @@ -279,28 +278,20 @@ def transcribe_file_with_word_level_confidence(speech_file):
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('command')
parser.add_argument(
'path', help='File for audio file to be recognized')
parser.add_argument(
'first', help='First language in audio file to be recognized',
nargs='?')
parser.add_argument(
'second', help='Second language in audio file to be recognized',
nargs='?')

args = parser.parse_args()

if args.command == 'enhanced-model':
transcribe_file_with_enhanced_model(args.path)
transcribe_file_with_enhanced_model()
elif args.command == 'metadata':
transcribe_file_with_metadata(args.path)
transcribe_file_with_metadata()
elif args.command == 'punctuation':
transcribe_file_with_auto_punctuation(args.path)
transcribe_file_with_auto_punctuation()
elif args.command == 'diarization':
transcribe_file_with_diarization(args.path)
transcribe_file_with_diarization()
elif args.command == 'multi-channel':
transcribe_file_with_multichannel(args.path)
transcribe_file_with_multichannel()
elif args.command == 'multi-language':
transcribe_file_with_multilanguage(args.path, args.first, args.second)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For future reference, argparse's sub-commands feature would be helpful to avoid having args that only matter for one command or another.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good.

transcribe_file_with_multilanguage()
elif args.command == 'word-level-conf':
transcribe_file_with_word_level_confidence(args.path)
transcribe_file_with_word_level_confidence()
23 changes: 8 additions & 15 deletions speech/cloud-client/beta_snippets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,56 +26,49 @@


def test_transcribe_file_with_enhanced_model(capsys):
transcribe_file_with_enhanced_model(
os.path.join(RESOURCES, 'commercial_mono.wav'))
transcribe_file_with_enhanced_model()
out, _ = capsys.readouterr()

assert 'Chrome' in out


def test_transcribe_file_with_metadata(capsys):
transcribe_file_with_metadata(
os.path.join(RESOURCES, 'commercial_mono.wav'))
transcribe_file_with_metadata()
out, _ = capsys.readouterr()

assert 'Chrome' in out


def test_transcribe_file_with_auto_punctuation(capsys):
transcribe_file_with_auto_punctuation(
os.path.join(RESOURCES, 'commercial_mono.wav'))
transcribe_file_with_auto_punctuation()
out, _ = capsys.readouterr()

assert 'Okay. Sure.' in out


def test_transcribe_diarization(capsys):
transcribe_file_with_diarization(
os.path.join(RESOURCES, 'Google_Gnome.wav'))
transcribe_file_with_diarization()
out, err = capsys.readouterr()

assert 'OK Google stream stranger things from Netflix to my TV' in out
assert "word: 'here', speaker_tag: 1" in out


def test_transcribe_multichannel_file(capsys):
transcribe_file_with_multichannel(
os.path.join(RESOURCES, 'Google_Gnome.wav'))
transcribe_file_with_multichannel()
out, err = capsys.readouterr()

assert 'OK Google stream stranger things from Netflix to my TV' in out


def test_transcribe_multilanguage_file(capsys):
transcribe_file_with_multilanguage(
os.path.join(RESOURCES, 'multi.wav'), 'en-US', 'es')
transcribe_file_with_multilanguage()
out, err = capsys.readouterr()

assert 'how are you doing estoy bien e tu' in out


def test_transcribe_word_level_confidence(capsys):
transcribe_file_with_word_level_confidence(
os.path.join(RESOURCES, 'Google_Gnome.wav'))
transcribe_file_with_word_level_confidence()
out, err = capsys.readouterr()

assert 'OK Google stream stranger things from Netflix to my TV' in out
2 changes: 1 addition & 1 deletion texttospeech/cloud-client/audio_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

Example usage:
python audio_profile.py --text "hello" --effects_profile_id
"telephony-class-application"
"telephony-class-application" --output "output.mp3"
"""

import argparse
Expand Down