From ad3c8107e0172404b5394c467ff6ce666ca16760 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sun, 27 Oct 2024 12:21:16 +0800 Subject: [PATCH] Add Pascal API for Moonshine models (#1482) --- .github/workflows/pascal.yaml | 8 + .../non-streaming-asr/.gitignore | 1 + .../non-streaming-asr/moonshine.pas | 80 ++++++++++ .../non-streaming-asr/run-moonshine.sh | 42 ++++++ .../vad-with-non-streaming-asr/.gitignore | 1 + .../run-vad-with-moonshine.sh | 49 ++++++ .../vad_with_moonshine.pas | 139 ++++++++++++++++++ sherpa-onnx/pascal-api/sherpa_onnx.pas | 37 ++++- 8 files changed, 354 insertions(+), 3 deletions(-) create mode 100644 pascal-api-examples/non-streaming-asr/moonshine.pas create mode 100755 pascal-api-examples/non-streaming-asr/run-moonshine.sh create mode 100755 pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-moonshine.sh create mode 100644 pascal-api-examples/vad-with-non-streaming-asr/vad_with_moonshine.pas diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml index ba9a731633..306ae64800 100644 --- a/.github/workflows/pascal.yaml +++ b/.github/workflows/pascal.yaml @@ -165,6 +165,10 @@ jobs: cd ./pascal-api-examples pushd vad-with-non-streaming-asr + time ./run-vad-with-moonshine.sh + rm -rf sherpa-onnx-* + echo "---" + time ./run-vad-with-whisper.sh rm -rf sherpa-onnx-* echo "---" @@ -220,6 +224,10 @@ jobs: rm -rf sherpa-onnx-* echo "---" + ./run-moonshine.sh + rm -rf sherpa-onnx-* + echo "---" + ./run-whisper.sh rm -rf sherpa-onnx-* echo "---" diff --git a/pascal-api-examples/non-streaming-asr/.gitignore b/pascal-api-examples/non-streaming-asr/.gitignore index fbcf1c9680..aba0585a39 100644 --- a/pascal-api-examples/non-streaming-asr/.gitignore +++ b/pascal-api-examples/non-streaming-asr/.gitignore @@ -7,3 +7,4 @@ paraformer paraformer_itn sense_voice telespeech_ctc +moonshine diff --git a/pascal-api-examples/non-streaming-asr/moonshine.pas b/pascal-api-examples/non-streaming-asr/moonshine.pas new file mode 100644 index 0000000000..04597ad645 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/moonshine.pas @@ -0,0 +1,80 @@ +{ Copyright (c) 2024 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming Moonshine model +to decode files. + +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program moonshine; + +{$mode objfpc} + +uses + sherpa_onnx, + DateUtils, + SysUtils; + +var + Wave: TSherpaOnnxWave; + WaveFilename: AnsiString; + + Config: TSherpaOnnxOfflineRecognizerConfig; + Recognizer: TSherpaOnnxOfflineRecognizer; + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; + + Start: TDateTime; + Stop: TDateTime; + + Elapsed: Single; + Duration: Single; + RealTimeFactor: Single; +begin + Initialize(Config); + + Config.ModelConfig.Moonshine.Preprocessor := './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx'; + Config.ModelConfig.Moonshine.Encoder := './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx'; + Config.ModelConfig.Moonshine.UncachedDecoder := './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx'; + Config.ModelConfig.Moonshine.CachedDecoder := './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx'; + + Config.ModelConfig.Tokens := './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + WaveFilename := './sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav'; + + Wave := SherpaOnnxReadWave(WaveFilename); + + Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config); + Stream := Recognizer.CreateStream(); + Start := Now; + + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + + RecognitionResult := Recognizer.GetResult(Stream); + + Stop := Now; + + Elapsed := MilliSecondsBetween(Stop, Start) / 1000; + Duration := Length(Wave.Samples) / Wave.SampleRate; + RealTimeFactor := Elapsed / Duration; + + WriteLn(RecognitionResult.ToString); + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads])); + WriteLn(Format('Elapsed %.3f s', [Elapsed])); + WriteLn(Format('Wave duration %.3f s', [Duration])); + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor])); + + {Free resources to avoid memory leak. + + Note: You don't need to invoke them for this simple script. + However, you have to invoke them in your own large/complex project. + } + FreeAndNil(Stream); + FreeAndNil(Recognizer); +end. diff --git a/pascal-api-examples/non-streaming-asr/run-moonshine.sh b/pascal-api-examples/non-streaming-asr/run-moonshine.sh new file mode 100755 index 0000000000..9486b06e97 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/run-moonshine.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + ls -lh lib + popd +fi + +if [ ! -f ./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +fi + +fpc \ + -dSHERPA_ONNX_USE_SHARED_LIBS \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./moonshine.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./moonshine diff --git a/pascal-api-examples/vad-with-non-streaming-asr/.gitignore b/pascal-api-examples/vad-with-non-streaming-asr/.gitignore index 4718ed421b..d499ad3b37 100644 --- a/pascal-api-examples/vad-with-non-streaming-asr/.gitignore +++ b/pascal-api-examples/vad-with-non-streaming-asr/.gitignore @@ -1,3 +1,4 @@ !run-*.sh vad_with_whisper vad_with_sense_voice +vad_with_moonshine diff --git a/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-moonshine.sh b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-moonshine.sh new file mode 100755 index 0000000000..fdf04b639a --- /dev/null +++ b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-moonshine.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + popd +fi + +if [[ ! -f ./silero_vad.onnx ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +fi + +if [ ! -f ./Obama.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav +fi + +if [ ! -f ./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +fi + +fpc \ + -dSHERPA_ONNX_USE_SHARED_LIBS \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./vad_with_moonshine.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./vad_with_moonshine diff --git a/pascal-api-examples/vad-with-non-streaming-asr/vad_with_moonshine.pas b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_moonshine.pas new file mode 100644 index 0000000000..50a2e95d26 --- /dev/null +++ b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_moonshine.pas @@ -0,0 +1,139 @@ +{ Copyright (c) 2024 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming Moonshine model +with silero VAD to decode files. + +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program vad_with_moonshine; + +{$mode objfpc} + +uses + sherpa_onnx, + SysUtils; + +function CreateVad(): TSherpaOnnxVoiceActivityDetector; +var + Config: TSherpaOnnxVadModelConfig; + + SampleRate: Integer; + WindowSize: Integer; +begin + Initialize(Config); + + SampleRate := 16000; {Please don't change it unless you know the details} + WindowSize := 512; {Please don't change it unless you know the details} + + Config.SileroVad.Model := './silero_vad.onnx'; + Config.SileroVad.MinSpeechDuration := 0.5; + Config.SileroVad.MinSilenceDuration := 0.5; + Config.SileroVad.Threshold := 0.5; + Config.SileroVad.WindowSize := WindowSize; + Config.NumThreads:= 1; + Config.Debug:= True; + Config.Provider:= 'cpu'; + Config.SampleRate := SampleRate; + + Result := TSherpaOnnxVoiceActivityDetector.Create(Config, 30); +end; + +function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer; +var + Config: TSherpaOnnxOfflineRecognizerConfig; +begin + Initialize(Config); + + Config.ModelConfig.Moonshine.Preprocessor := './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx'; + Config.ModelConfig.Moonshine.Encoder := './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx'; + Config.ModelConfig.Moonshine.UncachedDecoder := './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx'; + Config.ModelConfig.Moonshine.CachedDecoder := './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx'; + + Config.ModelConfig.Tokens := './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + Result := TSherpaOnnxOfflineRecognizer.Create(Config); +end; + +var + Wave: TSherpaOnnxWave; + + Recognizer: TSherpaOnnxOfflineRecognizer; + Vad: TSherpaOnnxVoiceActivityDetector; + + Offset: Integer; + WindowSize: Integer; + SpeechSegment: TSherpaOnnxSpeechSegment; + + Start: Single; + Duration: Single; + + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; +begin + Vad := CreateVad(); + Recognizer := CreateOfflineRecognizer(); + + Wave := SherpaOnnxReadWave('./Obama.wav'); + if Wave.SampleRate <> Vad.Config.SampleRate then + begin + WriteLn(Format('Expected sample rate: %d. Given: %d', + [Vad.Config.SampleRate, Wave.SampleRate])); + + Exit; + end; + + WindowSize := Vad.Config.SileroVad.WindowSize; + Offset := 0; + while Offset + WindowSize <= Length(Wave.Samples) do + begin + Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize); + Offset += WindowSize; + + while not Vad.IsEmpty do + begin + SpeechSegment := Vad.Front(); + Vad.Pop(); + Stream := Recognizer.CreateStream(); + + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + RecognitionResult := Recognizer.GetResult(Stream); + + Start := SpeechSegment.Start / Wave.SampleRate; + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate; + WriteLn(Format('%.3f -- %.3f %s', + [Start, Start + Duration, RecognitionResult.Text])); + + FreeAndNil(Stream); + end; + end; + + Vad.Flush; + + while not Vad.IsEmpty do + begin + SpeechSegment := Vad.Front(); + Vad.Pop(); + Stream := Recognizer.CreateStream(); + + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + RecognitionResult := Recognizer.GetResult(Stream); + + Start := SpeechSegment.Start / Wave.SampleRate; + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate; + WriteLn(Format('%.3f -- %.3f %s', + [Start, Start + Duration, RecognitionResult.Text])); + + FreeAndNil(Stream); + end; + + FreeAndNil(Recognizer); + FreeAndNil(Vad); +end. diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas index 1b24dec805..cff2157593 100644 --- a/sherpa-onnx/pascal-api/sherpa_onnx.pas +++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas @@ -250,6 +250,14 @@ TSherpaOnnxOfflineWhisperModelConfig = record class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig); end; + TSherpaOnnxOfflineMoonshineModelConfig = record + Preprocessor: AnsiString; + Encoder: AnsiString; + UncachedDecoder: AnsiString; + CachedDecoder: AnsiString; + function ToString: AnsiString; + end; + TSherpaOnnxOfflineTdnnModelConfig = record Model: AnsiString; function ToString: AnsiString; @@ -285,6 +293,7 @@ TSherpaOnnxOfflineModelConfig = record BpeVocab: AnsiString; TeleSpeechCtc: AnsiString; SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig; + Moonshine: TSherpaOnnxOfflineMoonshineModelConfig; class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig); function ToString: AnsiString; end; @@ -617,6 +626,12 @@ SherpaOnnxOfflineWhisperModelConfig = record Task: PAnsiChar; TailPaddings: cint32; end; + SherpaOnnxOfflineMoonshineModelConfig = record + Preprocessor: PAnsiChar; + Encoder: PAnsiChar; + UncachedDecoder: PAnsiChar; + CachedDecoder: PAnsiChar; + end; SherpaOnnxOfflineTdnnModelConfig = record Model: PAnsiChar; end; @@ -644,6 +659,7 @@ SherpaOnnxOfflineModelConfig = record BpeVocab: PAnsiChar; TeleSpeechCtc: PAnsiChar; SenseVoice: SherpaOnnxOfflineSenseVoiceModelConfig; + Moonshine: SherpaOnnxOfflineMoonshineModelConfig; end; SherpaOnnxOfflineRecognizerConfig = record @@ -1312,6 +1328,16 @@ function TSherpaOnnxOfflineWhisperModelConfig.ToString: AnsiString; [Self.Encoder, Self.Decoder, Self.Language, Self.Task, Self.TailPaddings]); end; +function TSherpaOnnxOfflineMoonshineModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineMoonshineModelConfig(' + + 'Preprocessor := %s, ' + + 'Encoder := %s, ' + + 'UncachedDecoder := %s, ' + + 'CachedDecoder := %s)', + [Self.Preprocessor, Self.Encoder, Self.UncachedDecoder, Self.CachedDecoder]); +end; + function TSherpaOnnxOfflineTdnnModelConfig.ToString: AnsiString; begin Result := Format('TSherpaOnnxOfflineTdnnModelConfig(Model := %s)', @@ -1353,13 +1379,14 @@ function TSherpaOnnxOfflineModelConfig.ToString: AnsiString; 'ModelingUnit := %s, ' + 'BpeVocab := %s, ' + 'TeleSpeechCtc := %s, ' + - 'SenseVoice := %s' + + 'SenseVoice := %s, ' + + 'Moonshine := %s' + ')', [Self.Transducer.ToString, Self.Paraformer.ToString, Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString, Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider, Self.ModelType, Self.ModelingUnit, Self.BpeVocab, - Self.TeleSpeechCtc, Self.SenseVoice.ToString + Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString ]); end; @@ -1407,7 +1434,6 @@ constructor TSherpaOnnxOfflineRecognizer.Create(Config: TSherpaOnnxOfflineRecogn C.ModelConfig.Tdnn.Model := PAnsiChar(Config.ModelConfig.Tdnn.Model); - C.ModelConfig.Tokens := PAnsiChar(Config.ModelConfig.Tokens); C.ModelConfig.NumThreads := Config.ModelConfig.NumThreads; C.ModelConfig.Debug := Ord(Config.ModelConfig.Debug); @@ -1421,6 +1447,11 @@ constructor TSherpaOnnxOfflineRecognizer.Create(Config: TSherpaOnnxOfflineRecogn C.ModelConfig.SenseVoice.Language := PAnsiChar(Config.ModelConfig.SenseVoice.Language); C.ModelConfig.SenseVoice.UseItn := Ord(Config.ModelConfig.SenseVoice.UseItn); + C.ModelConfig.Moonshine.Preprocessor := PAnsiChar(Config.ModelConfig.Moonshine.Preprocessor); + C.ModelConfig.Moonshine.Encoder := PAnsiChar(Config.ModelConfig.Moonshine.Encoder); + C.ModelConfig.Moonshine.UncachedDecoder := PAnsiChar(Config.ModelConfig.Moonshine.UncachedDecoder); + C.ModelConfig.Moonshine.CachedDecoder := PAnsiChar(Config.ModelConfig.Moonshine.CachedDecoder); + C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model); C.LMConfig.Scale := Config.LMConfig.Scale;