Add VAD + Non-streaming ASR Python example. (#332)

k2-fsa · Sep 22, 2023 · 969fff5 · 969fff5
1 parent cf199ad
commit 969fff5
Show file tree

Hide file tree

Showing 2 changed files with 340 additions and 0 deletions.
diff --git a/python-api-examples/README.md b/python-api-examples/README.md
@@ -7,3 +7,6 @@
 - [vad-remove-non-speech-segments.py](./vad-remove-non-speech-segments.py) It uses
   [silero-vad](https://github.com/snakers4/silero-vad) to remove non-speech
   segments and concatenate all speech segments into a single one.
+- [vad-with-non-streaming-asr.py](./vad-with-non-streaming-asr.py) It shows
+  how to use VAD with a non-streaming ASR model for speech recognition from
+  a microphone.
diff --git a/python-api-examples/vad-with-non-streaming-asr.py b/python-api-examples/vad-with-non-streaming-asr.py
@@ -0,0 +1,337 @@
+#!/usr/bin/env python3
+#
+# Copyright (c)  2023  Xiaomi Corporation
+
+"""
+This file demonstrates how to use sherpa-onnx Python APIs
+with VAD and non-streaming ASR models for speech recognition
+from a microphone.
+
+Note that you need a non-streaming model for this script.
+
+(1) For paraformer
+
+    ./python-api-examples/vad-with-non-streaming-asr.py  \
+      --silero-vad-model=/path/to/silero_vad.onnx \
+      --tokens=/path/to/tokens.txt \
+      --paraformer=/path/to/paraformer.onnx \
+      --num-threads=2 \
+      --decoding-method=greedy_search \
+      --debug=false \
+      --sample-rate=16000 \
+      --feature-dim=80
+
+(2) For transducer models from icefall
+
+    ./python-api-examples/vad-with-non-streaming-asr.py  \
+      --silero-vad-model=/path/to/silero_vad.onnx \
+      --tokens=/path/to/tokens.txt \
+      --encoder=/path/to/encoder.onnx \
+      --decoder=/path/to/decoder.onnx \
+      --joiner=/path/to/joiner.onnx \
+      --num-threads=2 \
+      --decoding-method=greedy_search \
+      --debug=false \
+      --sample-rate=16000 \
+      --feature-dim=80
+
+(3) For Whisper models
+
+    ./python-api-examples/vad-with-non-streaming-asr.py  \
+      --silero-vad-model=/path/to/silero_vad.onnx \
+      --whisper-encoder=./sherpa-onnx-whisper-base.en/base.en-encoder.int8.onnx \
+      --whisper-decoder=./sherpa-onnx-whisper-base.en/base.en-decoder.int8.onnx \
+      --tokens=./sherpa-onnx-whisper-base.en/base.en-tokens.txt \
+      --whisper-task=transcribe \
+      --num-threads=2
+
+Please refer to
+https://k2-fsa.github.io/sherpa/onnx/index.html
+to install sherpa-onnx and to download non-streaming pre-trained models
+used in this file.
+
+Please visit
+https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
+to download silero_vad.onnx
+
+For instance,
+
+wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
+"""
+import argparse
+import sys
+from pathlib import Path
+
+import numpy as np
+
+try:
+    import sounddevice as sd
+except ImportError:
+    print("Please install sounddevice first. You can use")
+    print()
+    print("  pip install sounddevice")
+    print()
+    print("to install it")
+    sys.exit(-1)
+
+import sherpa_onnx
+
+
+def _str2bool(value) -> bool:
+    """Convert a command-line string such as "true" or "false" to a bool.
+
+    ``type=bool`` must not be used with argparse: ``bool("false")`` is True
+    because every non-empty string is truthy, so ``--debug=false`` would
+    silently enable debugging.
+
+    Raises:
+      argparse.ArgumentTypeError: If ``value`` is not a recognized boolean
+        spelling.
+    """
+    if isinstance(value, bool):
+        return value
+
+    text = value.strip().lower()
+    if text in ("true", "t", "yes", "y", "1"):
+        return True
+    # The empty string is kept as False, which is what bool("") returns.
+    if text in ("false", "f", "no", "n", "0", ""):
+        return False
+
+    raise argparse.ArgumentTypeError(
+        f"Expected a boolean value (true/false), got: {value!r}"
+    )
+
+
+def get_args():
+    """Parse the command-line arguments of this script.
+
+    Returns:
+      The argparse.Namespace produced by parsing sys.argv. Model paths that
+      are not given on the command line default to the empty string.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--silero-vad-model",
+        type=str,
+        required=True,
+        help="Path to silero_vad.onnx",
+    )
+
+    # Every supported model type needs tokens.txt, so fail early with a
+    # usage error instead of passing None to the file-existence check.
+    parser.add_argument(
+        "--tokens",
+        type=str,
+        required=True,
+        help="Path to tokens.txt",
+    )
+
+    parser.add_argument(
+        "--encoder",
+        default="",
+        type=str,
+        help="Path to the transducer encoder model",
+    )
+
+    parser.add_argument(
+        "--decoder",
+        default="",
+        type=str,
+        help="Path to the transducer decoder model",
+    )
+
+    parser.add_argument(
+        "--joiner",
+        default="",
+        type=str,
+        help="Path to the transducer joiner model",
+    )
+
+    parser.add_argument(
+        "--paraformer",
+        default="",
+        type=str,
+        help="Path to the model.onnx from Paraformer",
+    )
+
+    parser.add_argument(
+        "--num-threads",
+        type=int,
+        default=1,
+        help="Number of threads for neural network computation",
+    )
+
+    parser.add_argument(
+        "--whisper-encoder",
+        default="",
+        type=str,
+        help="Path to whisper encoder model",
+    )
+
+    parser.add_argument(
+        "--whisper-decoder",
+        default="",
+        type=str,
+        help="Path to whisper decoder model",
+    )
+
+    parser.add_argument(
+        "--whisper-language",
+        default="",
+        type=str,
+        help="""It specifies the spoken language in the input file.
+        Example values: en, fr, de, zh, ja.
+        Available languages for multilingual models can be found at
+        https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L10
+        If not specified, we infer the language from the input audio file.
+        """,
+    )
+
+    parser.add_argument(
+        "--whisper-task",
+        default="transcribe",
+        choices=["transcribe", "translate"],
+        type=str,
+        help="""For multilingual models, if you specify translate, the output
+        will be in English.
+        """,
+    )
+
+    parser.add_argument(
+        "--decoding-method",
+        type=str,
+        default="greedy_search",
+        help="""Valid values are greedy_search and modified_beam_search.
+        modified_beam_search is valid only for transducer models.
+        """,
+    )
+    parser.add_argument(
+        "--debug",
+        type=_str2bool,
+        default=False,
+        help="True to show debug messages when loading models.",
+    )
+
+    parser.add_argument(
+        "--sample-rate",
+        type=int,
+        default=16000,
+        help="""Sample rate of the feature extractor. Must match the one
+        expected by the model.""",
+    )
+
+    parser.add_argument(
+        "--feature-dim",
+        type=int,
+        default=80,
+        help="Feature dimension. Must match the one expected by the model",
+    )
+
+    return parser.parse_args()
+
+
+def assert_file_exists(filename: str):
+    """Raise an error unless ``filename`` names an existing regular file.
+
+    An explicit exception is raised instead of using ``assert`` so that the
+    check is still performed when Python runs with ``-O``. An empty or
+    missing (None) filename is reported the same way as a file that does
+    not exist.
+
+    Args:
+      filename: Path of the file to check.
+
+    Raises:
+      FileNotFoundError: If ``filename`` is empty, None, or not a regular
+        file.
+    """
+    if not filename or not Path(filename).is_file():
+        raise FileNotFoundError(
+            f"{filename} does not exist!\n"
+            "Please refer to "
+            "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
+        )
+
+
+def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
+    """Build the non-streaming recognizer selected by the command-line options.
+
+    The model type is chosen by the first non-empty option among
+    --encoder (transducer), --paraformer and --whisper-encoder, checked in
+    that order.
+
+    Args:
+      args: The parsed command-line arguments returned by get_args().
+
+    Returns:
+      The recognizer created by the matching
+      sherpa_onnx.OfflineRecognizer.from_* constructor.
+
+    Raises:
+      ValueError: If none of the model options was given.
+    """
+    if args.encoder:
+        # A transducer excludes every other model type.
+        for other in (args.paraformer, args.whisper_encoder, args.whisper_decoder):
+            assert len(other) == 0, other
+
+        for model_file in (args.encoder, args.decoder, args.joiner):
+            assert_file_exists(model_file)
+
+        return sherpa_onnx.OfflineRecognizer.from_transducer(
+            encoder=args.encoder,
+            decoder=args.decoder,
+            joiner=args.joiner,
+            tokens=args.tokens,
+            num_threads=args.num_threads,
+            sample_rate=args.sample_rate,
+            feature_dim=args.feature_dim,
+            decoding_method=args.decoding_method,
+            debug=args.debug,
+        )
+
+    if args.paraformer:
+        # A paraformer excludes the whisper options.
+        for other in (args.whisper_encoder, args.whisper_decoder):
+            assert len(other) == 0, other
+
+        assert_file_exists(args.paraformer)
+
+        return sherpa_onnx.OfflineRecognizer.from_paraformer(
+            paraformer=args.paraformer,
+            tokens=args.tokens,
+            num_threads=args.num_threads,
+            sample_rate=args.sample_rate,
+            feature_dim=args.feature_dim,
+            decoding_method=args.decoding_method,
+            debug=args.debug,
+        )
+
+    if args.whisper_encoder:
+        for model_file in (args.whisper_encoder, args.whisper_decoder):
+            assert_file_exists(model_file)
+
+        return sherpa_onnx.OfflineRecognizer.from_whisper(
+            encoder=args.whisper_encoder,
+            decoder=args.whisper_decoder,
+            tokens=args.tokens,
+            num_threads=args.num_threads,
+            decoding_method=args.decoding_method,
+            debug=args.debug,
+            language=args.whisper_language,
+            task=args.whisper_task,
+        )
+
+    raise ValueError("Please specify at least one model")
+
+
+def main():
+    """Recognize speech from a microphone using VAD and a non-streaming model.
+
+    Audio is read from the input stream in chunks of 0.1 second and passed
+    to the voice activity detector in windows of ``window_size`` samples.
+    Every speech segment queued by the detector is decoded with the
+    non-streaming recognizer, and each non-empty result is printed together
+    with a running index. The loop runs until it is interrupted.
+
+    Raises:
+      ValueError: If --num-threads is not positive or --sample-rate is not
+        16000.
+    """
+    # Parse the arguments first so that --help and usage errors do not
+    # depend on any audio device being present.
+    args = get_args()
+
+    devices = sd.query_devices()
+    if len(devices) == 0:
+        print("No microphone devices found")
+        sys.exit(0)
+
+    print(devices)
+
+    # If you want to select a different input device, please use
+    # sd.default.device[0] = xxx
+    # where xxx is the device number
+
+    default_input_device_idx = sd.default.device[0]
+    print(f'Use default device: {devices[default_input_device_idx]["name"]}')
+
+    assert_file_exists(args.tokens)
+    assert_file_exists(args.silero_vad_model)
+
+    # Explicit exceptions (not assert) so the checks survive ``python -O``.
+    if args.num_threads <= 0:
+        raise ValueError(f"--num-threads must be positive. Given: {args.num_threads}")
+
+    if args.sample_rate != 16000:
+        raise ValueError(
+            f"Only sample rate 16000 is supported. Given: {args.sample_rate}"
+        )
+
+    print("Creating recognizer. Please wait...")
+    recognizer = create_recognizer(args)
+
+    config = sherpa_onnx.VadModelConfig()
+    config.silero_vad.model = args.silero_vad_model
+    config.silero_vad.min_silence_duration = 0.25
+    config.sample_rate = args.sample_rate
+
+    # Number of samples handed to the VAD per call.
+    window_size = config.silero_vad.window_size
+
+    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)
+
+    samples_per_read = int(0.1 * args.sample_rate)  # 0.1 second = 100 ms
+
+    print("Started! Please speak")
+
+    # Samples read from the microphone that have not been given to the VAD yet.
+    buffer = []
+    # Only the running index of the printed results is needed, so keep a
+    # counter instead of storing every recognized text forever.
+    num_results = 0
+    with sd.InputStream(channels=1, dtype="float32", samplerate=args.sample_rate) as s:
+        while True:
+            samples, _ = s.read(samples_per_read)  # a blocking read
+            samples = samples.reshape(-1)
+
+            buffer = np.concatenate([buffer, samples])
+            # Feed the VAD one window at a time; the remainder stays in
+            # the buffer until the next read.
+            while len(buffer) > window_size:
+                vad.accept_waveform(buffer[:window_size])
+                buffer = buffer[window_size:]
+
+            # Decode every speech segment the VAD has queued so far.
+            while not vad.empty():
+                stream = recognizer.create_stream()
+                stream.accept_waveform(args.sample_rate, vad.front.samples)
+
+                vad.pop()
+                recognizer.decode_stream(stream)
+
+                text = stream.result.text.strip().lower()
+                if len(text):
+                    print(f"{num_results}: {text}")
+                    num_results += 1
+
+
+# Script entry point: run the recognition loop until the user interrupts it.
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        # Ctrl + C is the expected way to stop; exit without a traceback.
+        print("\nCaught Ctrl + C. Exiting")