From 969fff56229209e704f573dd209b4085a6f31b8a Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Fri, 22 Sep 2023 11:53:47 +0800
Subject: [PATCH] Add VAD + Non-streaming ASR Python example. (#332)

---
 python-api-examples/README.md                 |   3 +
 .../vad-with-non-streaming-asr.py             | 337 ++++++++++++++++++
 2 files changed, 340 insertions(+)
 create mode 100755 python-api-examples/vad-with-non-streaming-asr.py

diff --git a/python-api-examples/README.md b/python-api-examples/README.md
index a1e54a36a..24176bea0 100644
--- a/python-api-examples/README.md
+++ b/python-api-examples/README.md
@@ -7,3 +7,6 @@
 - [vad-remove-non-speech-segments.py](./vad-remove-non-speech-segments.py)
   It uses [silero-vad](https://github.com/snakers4/silero-vad) to remove
   non-speech segments and concatenate all speech segments into a single one.
+- [vad-with-non-streaming-asr.py](./vad-with-non-streaming-asr.py) It shows
+  how to use VAD with a non-streaming ASR model for speech recognition from
+  a microphone.

diff --git a/python-api-examples/vad-with-non-streaming-asr.py b/python-api-examples/vad-with-non-streaming-asr.py
new file mode 100755
index 000000000..a73a1bab2
--- /dev/null
+++ b/python-api-examples/vad-with-non-streaming-asr.py
@@ -0,0 +1,337 @@
#!/usr/bin/env python3
#
# Copyright (c) 2023 Xiaomi Corporation

"""
This file demonstrates how to use sherpa-onnx Python APIs
with VAD and non-streaming ASR models for speech recognition
from a microphone.

Note that you need a non-streaming model for this script.

(1) For paraformer

    ./python-api-examples/vad-with-non-streaming-asr.py \
      --silero-vad-model=/path/to/silero_vad.onnx \
      --tokens=/path/to/tokens.txt \
      --paraformer=/path/to/paraformer.onnx \
      --num-threads=2 \
      --decoding-method=greedy_search \
      --debug=false \
      --sample-rate=16000 \
      --feature-dim=80

(2) For transducer models from icefall

    ./python-api-examples/vad-with-non-streaming-asr.py \
      --silero-vad-model=/path/to/silero_vad.onnx \
      --tokens=/path/to/tokens.txt \
      --encoder=/path/to/encoder.onnx \
      --decoder=/path/to/decoder.onnx \
      --joiner=/path/to/joiner.onnx \
      --num-threads=2 \
      --decoding-method=greedy_search \
      --debug=false \
      --sample-rate=16000 \
      --feature-dim=80

(3) For Whisper models

    ./python-api-examples/vad-with-non-streaming-asr.py \
      --silero-vad-model=/path/to/silero_vad.onnx \
      --whisper-encoder=./sherpa-onnx-whisper-base.en/base.en-encoder.int8.onnx \
      --whisper-decoder=./sherpa-onnx-whisper-base.en/base.en-decoder.int8.onnx \
      --tokens=./sherpa-onnx-whisper-base.en/base.en-tokens.txt \
      --whisper-task=transcribe \
      --num-threads=2

Please refer to
https://k2-fsa.github.io/sherpa/onnx/index.html
to install sherpa-onnx and to download the non-streaming pre-trained models
used in this file.

Please visit
https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
to download silero_vad.onnx.

For instance,

wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
"""
import argparse
import sys
from pathlib import Path

import numpy as np

try:
    import sounddevice as sd
except ImportError:
    print("Please install sounddevice first. You can use")
    print()
    print("  pip install sounddevice")
    print()
    print("to install it")
    sys.exit(-1)

import sherpa_onnx
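

# Note: the usage examples in the docstring pass --debug=false, but
# argparse's type=bool converts every non-empty string, including
# "false", to True. A small converter like the one below (a common
# argparse idiom, not a sherpa-onnx API) parses such values correctly;
# the --debug option further down uses it.
def str2bool(v: str) -> bool:
    """Convert command-line strings such as "true"/"false" to a bool."""
    return v.lower() in ("yes", "true", "t", "y", "1")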
You can use") + print() + print(" pip install sounddevice") + print() + print("to install it") + sys.exit(-1) + +import sherpa_onnx + + +def get_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--silero-vad-model", + type=str, + required=True, + help="Path to silero_vad.onnx", + ) + + parser.add_argument( + "--tokens", + type=str, + help="Path to tokens.txt", + ) + + parser.add_argument( + "--encoder", + default="", + type=str, + help="Path to the transducer encoder model", + ) + + parser.add_argument( + "--decoder", + default="", + type=str, + help="Path to the transducer decoder model", + ) + + parser.add_argument( + "--joiner", + default="", + type=str, + help="Path to the transducer joiner model", + ) + + parser.add_argument( + "--paraformer", + default="", + type=str, + help="Path to the model.onnx from Paraformer", + ) + + parser.add_argument( + "--num-threads", + type=int, + default=1, + help="Number of threads for neural network computation", + ) + + parser.add_argument( + "--whisper-encoder", + default="", + type=str, + help="Path to whisper encoder model", + ) + + parser.add_argument( + "--whisper-decoder", + default="", + type=str, + help="Path to whisper decoder model", + ) + + parser.add_argument( + "--whisper-language", + default="", + type=str, + help="""It specifies the spoken language in the input file. + Example values: en, fr, de, zh, jp. + Available languages for multilingual models can be found at + https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L10 + If not specified, we infer the language from the input audio file. + """, + ) + + parser.add_argument( + "--whisper-task", + default="transcribe", + choices=["transcribe", "translate"], + type=str, + help="""For multilingual models, if you specify translate, the output + will be in English. + """, + ) + + parser.add_argument( + "--decoding-method", + type=str, + default="greedy_search", + help="""Valid values are greedy_search and modified_beam_search. + modified_beam_search is valid only for transducer models. + """, + ) + parser.add_argument( + "--debug", + type=bool, + default=False, + help="True to show debug messages when loading modes.", + ) + + parser.add_argument( + "--sample-rate", + type=int, + default=16000, + help="""Sample rate of the feature extractor. Must match the one + expected by the model.""", + ) + + parser.add_argument( + "--feature-dim", + type=int, + default=80, + help="Feature dimension. 


def assert_file_exists(filename: str):
    assert Path(filename).is_file(), (
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )


def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
    if args.encoder:
        # Transducer: encoder, decoder, and joiner must be given together,
        # and no other model type may be specified at the same time.
        assert len(args.paraformer) == 0, args.paraformer
        assert len(args.whisper_encoder) == 0, args.whisper_encoder
        assert len(args.whisper_decoder) == 0, args.whisper_decoder

        assert_file_exists(args.encoder)
        assert_file_exists(args.decoder)
        assert_file_exists(args.joiner)

        recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
            encoder=args.encoder,
            decoder=args.decoder,
            joiner=args.joiner,
            tokens=args.tokens,
            num_threads=args.num_threads,
            sample_rate=args.sample_rate,
            feature_dim=args.feature_dim,
            decoding_method=args.decoding_method,
            debug=args.debug,
        )
    elif args.paraformer:
        assert len(args.whisper_encoder) == 0, args.whisper_encoder
        assert len(args.whisper_decoder) == 0, args.whisper_decoder

        assert_file_exists(args.paraformer)

        recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer(
            paraformer=args.paraformer,
            tokens=args.tokens,
            num_threads=args.num_threads,
            sample_rate=args.sample_rate,
            feature_dim=args.feature_dim,
            decoding_method=args.decoding_method,
            debug=args.debug,
        )
    elif args.whisper_encoder:
        assert_file_exists(args.whisper_encoder)
        assert_file_exists(args.whisper_decoder)

        recognizer = sherpa_onnx.OfflineRecognizer.from_whisper(
            encoder=args.whisper_encoder,
            decoder=args.whisper_decoder,
            tokens=args.tokens,
            num_threads=args.num_threads,
            decoding_method=args.decoding_method,
            debug=args.debug,
            language=args.whisper_language,
            task=args.whisper_task,
        )
    else:
        raise ValueError("Please specify at least one model")

    return recognizer


def main():
    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        sys.exit(0)

    print(devices)

    # If you want to select a different input device, please use
    # sd.default.device[0] = xxx
    # where xxx is the device number

    default_input_device_idx = sd.default.device[0]
    print(f'Use default device: {devices[default_input_device_idx]["name"]}')

    args = get_args()
    assert_file_exists(args.tokens)
    assert_file_exists(args.silero_vad_model)

    assert args.num_threads > 0, args.num_threads

    assert (
        args.sample_rate == 16000
    ), f"Only sample rate 16000 is supported. Given: {args.sample_rate}"

    print("Creating recognizer. Please wait...")
    recognizer = create_recognizer(args)

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.silero_vad.min_silence_duration = 0.25
    config.sample_rate = args.sample_rate

    # window_size is measured in samples; the VAD consumes audio in chunks
    # of exactly this size.
    window_size = config.silero_vad.window_size

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)

    samples_per_read = int(0.1 * args.sample_rate)  # 0.1 second = 100 ms

    print("Started! Please speak")

    buffer = []
    texts = []
    with sd.InputStream(channels=1, dtype="float32", samplerate=args.sample_rate) as s:
        while True:
            samples, _ = s.read(samples_per_read)  # a blocking read
            samples = samples.reshape(-1)

            # Feed the VAD in fixed-size windows; keep any remainder in the
            # buffer for the next read.
            buffer = np.concatenate([buffer, samples])
            while len(buffer) > window_size:
                vad.accept_waveform(buffer[:window_size])
                buffer = buffer[window_size:]

            # Each speech segment detected by the VAD is decoded as a whole
            # by the non-streaming recognizer.
            while not vad.empty():
                stream = recognizer.create_stream()
                stream.accept_waveform(args.sample_rate, vad.front.samples)

                vad.pop()
                recognizer.decode_stream(stream)

                text = stream.result.text.strip().lower()
                if len(text):
                    idx = len(texts)
                    texts.append(text)
                    print(f"{idx}: {text}")


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")
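
Note: the loop above needs a live microphone. For quick offline testing, the
same VAD + non-streaming ASR pattern can be driven from a WAV file. The sketch
below is a minimal, untested adaptation, not part of the patch: the model and
file paths are placeholders, the paraformer variant is used only for
concreteness, a 16 kHz, 16-bit, mono WAV file is assumed, and it reuses only
the sherpa-onnx calls already shown above. Any speech still buffered inside
the VAD at end of file is left unflushed, mirroring the loop above.

    # Offline variant of the example above. All paths are placeholders.
    import wave

    import numpy as np
    import sherpa_onnx

    recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer(
        paraformer="/path/to/paraformer.onnx",  # placeholder
        tokens="/path/to/tokens.txt",  # placeholder
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
    )

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = "/path/to/silero_vad.onnx"  # placeholder
    config.sample_rate = 16000
    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)
    window_size = config.silero_vad.window_size

    with wave.open("/path/to/test.wav") as f:  # 16 kHz, 16-bit, mono assumed
        data = f.readframes(f.getnframes())
    # Convert 16-bit PCM to float32 in [-1, 1), as the microphone loop
    # receives from sounddevice.
    samples = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768

    # Feed only full windows, mirroring the original loop.
    num_windows = len(samples) // window_size
    for i in range(num_windows):
        start = i * window_size
        vad.accept_waveform(samples[start : start + window_size])

    while not vad.empty():
        stream = recognizer.create_stream()
        stream.accept_waveform(16000, vad.front.samples)
        vad.pop()
        recognizer.decode_stream(stream)
        print(stream.result.text)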