Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add VAD + Non-streaming ASR Python example. #332

Merged
merged 1 commit into from
Sep 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions python-api-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@
- [vad-remove-non-speech-segments.py](./vad-remove-non-speech-segments.py) It uses
[silero-vad](https://github.com/snakers4/silero-vad) to remove non-speech
segments and concatenate all speech segments into a single one.
- [vad-with-non-streaming-asr.py](./vad-with-non-streaming-asr.py) It shows
how to use VAD with a non-streaming ASR model for speech recognition from
a microphone.
337 changes: 337 additions & 0 deletions python-api-examples/vad-with-non-streaming-asr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,337 @@
#!/usr/bin/env python3
#
# Copyright (c) 2023 Xiaomi Corporation

"""
This file demonstrates how to use sherpa-onnx Python APIs
with VAD and non-streaming ASR models for speech recognition
from a microphone.

Note that you need a non-streaming model for this script.

(1) For paraformer

./python-api-examples/vad-with-non-streaming-asr.py \
--silero-vad-model=/path/to/silero_vad.onnx \
--tokens=/path/to/tokens.txt \
--paraformer=/path/to/paraformer.onnx \
--num-threads=2 \
--decoding-method=greedy_search \
--debug=false \
--sample-rate=16000 \
--feature-dim=80

(2) For transducer models from icefall

./python-api-examples/vad-with-non-streaming-asr.py \
--silero-vad-model=/path/to/silero_vad.onnx \
--tokens=/path/to/tokens.txt \
--encoder=/path/to/encoder.onnx \
--decoder=/path/to/decoder.onnx \
--joiner=/path/to/joiner.onnx \
--num-threads=2 \
--decoding-method=greedy_search \
--debug=false \
--sample-rate=16000 \
--feature-dim=80

(3) For Whisper models

./python-api-examples/vad-with-non-streaming-asr.py \
--silero-vad-model=/path/to/silero_vad.onnx \
--whisper-encoder=./sherpa-onnx-whisper-base.en/base.en-encoder.int8.onnx \
--whisper-decoder=./sherpa-onnx-whisper-base.en/base.en-decoder.int8.onnx \
--tokens=./sherpa-onnx-whisper-base.en/base.en-tokens.txt \
--whisper-task=transcribe \
--num-threads=2

Please refer to
https://k2-fsa.github.io/sherpa/onnx/index.html
to install sherpa-onnx and to download non-streaming pre-trained models
used in this file.

Please visit
https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
to download silero_vad.onnx

For instance,

wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
"""
import argparse
import sys
from pathlib import Path

import numpy as np

try:
import sounddevice as sd
except ImportError:
print("Please install sounddevice first. You can use")
print()
print(" pip install sounddevice")
print()
print("to install it")
sys.exit(-1)

import sherpa_onnx


def _str2bool(value):
    """Convert a command-line token such as ``true`` or ``false`` to a bool.

    ``type=bool`` must not be used with argparse: ``bool("false")`` is
    ``True`` because every non-empty string is truthy.

    Args:
      value: The raw option value (or an actual bool).

    Returns:
      The parsed boolean. The empty string maps to ``False`` so that
      ``--debug=`` keeps its previous meaning.

    Raises:
      argparse.ArgumentTypeError: If the value is not a recognized boolean.
    """
    if isinstance(value, bool):
        return value

    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1", "on"):
        return True
    if normalized in ("false", "f", "no", "n", "0", "off", ""):
        return False

    raise argparse.ArgumentTypeError(
        f"Expected a boolean value (e.g. true or false), got {value!r}"
    )


def get_args():
    """Parse the command-line options of this script from ``sys.argv``.

    Returns:
      An ``argparse.Namespace`` with the VAD model path, the tokens file,
      the model paths for exactly one of transducer / paraformer / whisper,
      and the decoding and feature-extraction settings.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--silero-vad-model",
        type=str,
        required=True,
        help="Path to silero_vad.onnx",
    )

    # tokens.txt is needed by every supported model type, so fail early with
    # a clear argparse error instead of crashing later on a None path.
    parser.add_argument(
        "--tokens",
        type=str,
        required=True,
        help="Path to tokens.txt",
    )

    parser.add_argument(
        "--encoder",
        default="",
        type=str,
        help="Path to the transducer encoder model",
    )

    parser.add_argument(
        "--decoder",
        default="",
        type=str,
        help="Path to the transducer decoder model",
    )

    parser.add_argument(
        "--joiner",
        default="",
        type=str,
        help="Path to the transducer joiner model",
    )

    parser.add_argument(
        "--paraformer",
        default="",
        type=str,
        help="Path to the model.onnx from Paraformer",
    )

    parser.add_argument(
        "--num-threads",
        type=int,
        default=1,
        help="Number of threads for neural network computation",
    )

    parser.add_argument(
        "--whisper-encoder",
        default="",
        type=str,
        help="Path to whisper encoder model",
    )

    parser.add_argument(
        "--whisper-decoder",
        default="",
        type=str,
        help="Path to whisper decoder model",
    )

    parser.add_argument(
        "--whisper-language",
        default="",
        type=str,
        help="""It specifies the spoken language in the input file.
        Example values: en, fr, de, zh, ja.
        Available languages for multilingual models can be found at
        https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L10
        If not specified, we infer the language from the input audio file.
        """,
    )

    parser.add_argument(
        "--whisper-task",
        default="transcribe",
        choices=["transcribe", "translate"],
        type=str,
        help="""For multilingual models, if you specify translate, the output
        will be in English.
        """,
    )

    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="""Valid values are greedy_search and modified_beam_search.
        modified_beam_search is valid only for transducer models.
        """,
    )

    # NOTE: parsed with _str2bool, not bool, so that --debug=false is False.
    parser.add_argument(
        "--debug",
        type=_str2bool,
        default=False,
        help="True to show debug messages when loading models.",
    )

    parser.add_argument(
        "--sample-rate",
        type=int,
        default=16000,
        help="""Sample rate of the feature extractor. Must match the one
        expected by the model.""",
    )

    parser.add_argument(
        "--feature-dim",
        type=int,
        default=80,
        help="Feature dimension. Must match the one expected by the model",
    )

    return parser.parse_args()


def assert_file_exists(filename: str):
    """Assert that ``filename`` refers to an existing regular file.

    Args:
      filename: Path of the file to check.

    Raises:
      AssertionError: If the path is not an existing file. The message
        names the file and points to the pre-trained model download page.
    """
    candidate = Path(filename)
    hint = (
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )
    assert candidate.is_file(), hint


def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
    """Build the non-streaming recognizer selected by the parsed options.

    The model family is picked by the first non-empty option among
    ``args.encoder`` (transducer), ``args.paraformer`` and
    ``args.whisper_encoder``, in that order.

    Args:
      args: The namespace returned by ``get_args()``.

    Returns:
      The recognizer produced by the matching
      ``sherpa_onnx.OfflineRecognizer.from_*`` factory.

    Raises:
      AssertionError: If options of another model family are also given,
        or if a required model file does not exist.
      ValueError: If no model option is given at all.
    """
    if args.encoder:
        # Transducer: no paraformer or whisper model may be given as well.
        for conflicting in (
            args.paraformer,
            args.whisper_encoder,
            args.whisper_decoder,
        ):
            assert len(conflicting) == 0, conflicting

        for model_file in (args.encoder, args.decoder, args.joiner):
            assert_file_exists(model_file)

        return sherpa_onnx.OfflineRecognizer.from_transducer(
            encoder=args.encoder,
            decoder=args.decoder,
            joiner=args.joiner,
            tokens=args.tokens,
            num_threads=args.num_threads,
            sample_rate=args.sample_rate,
            feature_dim=args.feature_dim,
            decoding_method=args.decoding_method,
            debug=args.debug,
        )

    if args.paraformer:
        # Paraformer: no whisper model may be given as well.
        for conflicting in (args.whisper_encoder, args.whisper_decoder):
            assert len(conflicting) == 0, conflicting

        assert_file_exists(args.paraformer)

        return sherpa_onnx.OfflineRecognizer.from_paraformer(
            paraformer=args.paraformer,
            tokens=args.tokens,
            num_threads=args.num_threads,
            sample_rate=args.sample_rate,
            feature_dim=args.feature_dim,
            decoding_method=args.decoding_method,
            debug=args.debug,
        )

    if args.whisper_encoder:
        for model_file in (args.whisper_encoder, args.whisper_decoder):
            assert_file_exists(model_file)

        return sherpa_onnx.OfflineRecognizer.from_whisper(
            encoder=args.whisper_encoder,
            decoder=args.whisper_decoder,
            tokens=args.tokens,
            num_threads=args.num_threads,
            decoding_method=args.decoding_method,
            debug=args.debug,
            language=args.whisper_language,
            task=args.whisper_task,
        )

    raise ValueError("Please specify at least one model")


def main():
    """Recognize speech from the default microphone with VAD + offline ASR.

    Audio is read from the microphone in 100 ms chunks and fed to the voice
    activity detector one window at a time. Every segment the detector
    emits is decoded with the non-streaming recognizer, and each non-empty
    result is printed with a running index. The loop runs until the
    process is interrupted.

    Raises:
      AssertionError: If a required file is missing, ``--num-threads`` is
        not positive, or ``--sample-rate`` is not 16000.
    """
    # Parse and validate the command line before touching any audio
    # hardware, so that --help and argument errors work on any machine.
    args = get_args()
    assert_file_exists(args.tokens)
    assert_file_exists(args.silero_vad_model)

    assert args.num_threads > 0, args.num_threads

    assert (
        args.sample_rate == 16000
    ), f"Only sample rate 16000 is supported. Given: {args.sample_rate}"

    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        sys.exit(0)

    print(devices)

    # If you want to select a different input device, please use
    # sd.default.device[0] = xxx
    # where xxx is the device number

    default_input_device_idx = sd.default.device[0]
    print(f'Use default device: {devices[default_input_device_idx]["name"]}')

    print("Creating recognizer. Please wait...")
    recognizer = create_recognizer(args)

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.silero_vad.min_silence_duration = 0.25
    config.sample_rate = args.sample_rate

    # The detector is fed exactly this many samples per call below.
    window_size = config.silero_vad.window_size

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)

    samples_per_read = int(0.1 * args.sample_rate)  # 0.1 second = 100 ms

    print("Started! Please speak")

    # Start from an empty float32 array rather than a Python list: with a
    # list, np.concatenate would promote the float32 microphone samples to
    # float64 for the rest of the session.
    buffer = np.zeros(0, dtype=np.float32)
    texts = []
    with sd.InputStream(channels=1, dtype="float32", samplerate=args.sample_rate) as s:
        while True:
            samples, _ = s.read(samples_per_read)  # a blocking read
            samples = samples.reshape(-1)

            # Reads are not a multiple of the window size, so keep the
            # remainder for the next iteration.
            buffer = np.concatenate([buffer, samples])
            while len(buffer) > window_size:
                vad.accept_waveform(buffer[:window_size])
                buffer = buffer[window_size:]

            # Decode every segment the detector has queued so far.
            while not vad.empty():
                stream = recognizer.create_stream()
                stream.accept_waveform(args.sample_rate, vad.front.samples)

                vad.pop()
                recognizer.decode_stream(stream)

                text = stream.result.text.strip().lower()
                if text:
                    idx = len(texts)
                    texts.append(text)
                    print(f"{idx}: {text}")


if __name__ == "__main__":
    # main() loops forever reading from the microphone, so Ctrl + C is the
    # expected way to stop; report it instead of showing a traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")