Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Dart API for Moonshine models. #1481

Merged
Merged 1 commit on Oct 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/scripts/test-dart.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ echo "----zipformer transducer----"
./run-zipformer-transducer.sh
rm -rf sherpa-onnx-*

echo "----moonshine----"
./run-moonshine.sh
rm -rf sherpa-onnx-*

echo "----whisper----"
./run-whisper.sh
rm -rf sherpa-onnx-*
Expand Down Expand Up @@ -77,6 +81,10 @@ echo '----------TeleSpeech CTC----------'
./run-telespeech-ctc.sh
rm -rf sherpa-onnx-*

echo '----------moonshine----------'
./run-moonshine.sh
rm -rf sherpa-onnx-*

echo '----------whisper----------'
./run-whisper.sh
rm -rf sherpa-onnx-*
Expand Down
69 changes: 69 additions & 0 deletions dart-api-examples/non-streaming-asr/bin/moonshine.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

import './init.dart';

/// Transcribes a wave file with a non-streaming Moonshine ASR model.
///
/// All six command-line options below are required; when any is missing the
/// program prints the generated usage text and exits with status 1.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  // Option name -> help text. Doubles as the list of required options, so
  // registration and validation cannot drift apart.
  const helpText = {
    'preprocessor': 'Path to the moonshine preprocessor model',
    'encoder': 'Path to the moonshine encoder model',
    'uncached-decoder': 'Path to moonshine uncached decoder model',
    'cached-decoder': 'Path to moonshine cached decoder model',
    'tokens': 'Path to tokens.txt',
    'input-wav': 'Path to input.wav to transcribe',
  };

  final parser = ArgParser();
  for (final entry in helpText.entries) {
    parser.addOption(entry.key, help: entry.value);
  }

  final res = parser.parse(arguments);
  if (helpText.keys.any((name) => res[name] == null)) {
    print(parser.usage);
    exit(1);
  }

  // Moonshine needs four separate ONNX files.
  final moonshine = sherpa_onnx.OfflineMoonshineModelConfig(
    preprocessor: res['preprocessor'] as String,
    encoder: res['encoder'] as String,
    uncachedDecoder: res['uncached-decoder'] as String,
    cachedDecoder: res['cached-decoder'] as String,
  );

  final modelConfig = sherpa_onnx.OfflineModelConfig(
    moonshine: moonshine,
    tokens: res['tokens'] as String,
    debug: false,
    numThreads: 1,
  );
  final recognizer = sherpa_onnx.OfflineRecognizer(
      sherpa_onnx.OfflineRecognizerConfig(model: modelConfig));

  // Non-streaming decode: feed the whole wave at once, then fetch the result.
  final waveData = sherpa_onnx.readWave(res['input-wav'] as String);
  final stream = recognizer.createStream();
  stream.acceptWaveform(
      samples: waveData.samples, sampleRate: waveData.sampleRate);
  recognizer.decode(stream);

  print(recognizer.getResult(stream).text);

  stream.free();
  recognizer.free();
}
20 changes: 20 additions & 0 deletions dart-api-examples/non-streaming-asr/run-moonshine.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
# Downloads the int8 Moonshine tiny English model (if not already present)
# and runs the Dart non-streaming ASR example on one of its test waves.

set -ex

dart pub get

# Single source of truth for the model directory / archive name.
model=sherpa-onnx-moonshine-tiny-en-int8

if [ ! -f ./$model/tokens.txt ]; then
  # --fail makes curl exit non-zero on HTTP errors instead of saving an
  # error page as the archive.
  curl -SL --fail -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model.tar.bz2
  tar xvf $model.tar.bz2
  rm $model.tar.bz2
fi

dart run \
  ./bin/moonshine.dart \
  --preprocessor ./$model/preprocess.onnx \
  --encoder ./$model/encode.int8.onnx \
  --uncached-decoder ./$model/uncached_decode.int8.onnx \
  --cached-decoder ./$model/cached_decode.int8.onnx \
  --tokens ./$model/tokens.txt \
  --input-wav ./$model/test_wavs/0.wav
134 changes: 134 additions & 0 deletions dart-api-examples/vad-with-non-streaming-asr/bin/moonshine.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

import './init.dart';

/// Segments a long wave file into speech chunks with silero-vad and
/// transcribes each chunk with a non-streaming Moonshine ASR model.
///
/// All options are required; a missing one prints the usage text and exits
/// with status 1. Only 16 kHz input is supported.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('silero-vad', help: 'Path to silero_vad.onnx')
    ..addOption('preprocessor',
        help: 'Path to the moonshine preprocessor model')
    ..addOption('encoder', help: 'Path to the moonshine encoder model')
    ..addOption('uncached-decoder',
        help: 'Path to moonshine uncached decoder model')
    ..addOption('cached-decoder',
        help: 'Path to moonshine cached decoder model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('input-wav', help: 'Path to input.wav to transcribe');

  final res = parser.parse(arguments);
  if (res['silero-vad'] == null ||
      res['preprocessor'] == null ||
      res['encoder'] == null ||
      res['uncached-decoder'] == null ||
      res['cached-decoder'] == null ||
      res['tokens'] == null ||
      res['input-wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  // create VAD
  final sileroVad = res['silero-vad'] as String;

  final sileroVadConfig = sherpa_onnx.SileroVadModelConfig(
    model: sileroVad,
    minSilenceDuration: 0.25,
    minSpeechDuration: 0.5,
    maxSpeechDuration: 5.0,
  );

  final vadConfig = sherpa_onnx.VadModelConfig(
    sileroVad: sileroVadConfig,
    numThreads: 1,
    debug: true,
  );

  final vad = sherpa_onnx.VoiceActivityDetector(
      config: vadConfig, bufferSizeInSeconds: 10);

  // create moonshine recognizer
  final preprocessor = res['preprocessor'] as String;
  final encoder = res['encoder'] as String;
  final uncachedDecoder = res['uncached-decoder'] as String;
  final cachedDecoder = res['cached-decoder'] as String;
  final tokens = res['tokens'] as String;
  final inputWav = res['input-wav'] as String;

  final moonshine = sherpa_onnx.OfflineMoonshineModelConfig(
    preprocessor: preprocessor,
    encoder: encoder,
    uncachedDecoder: uncachedDecoder,
    cachedDecoder: cachedDecoder,
  );
  final modelConfig = sherpa_onnx.OfflineModelConfig(
    moonshine: moonshine,
    tokens: tokens,
    debug: false,
    numThreads: 1,
  );
  final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig);
  final recognizer = sherpa_onnx.OfflineRecognizer(config);

  final waveData = sherpa_onnx.readWave(inputWav);
  if (waveData.sampleRate != 16000) {
    print('Only 16000 Hz is supported. Given: ${waveData.sampleRate}');
    exit(1);
  }

  // Transcribes and prints every speech segment currently queued in the VAD.
  // Shared by the windowed feed loop and the final flush below.
  void transcribePending() {
    while (!vad.isEmpty()) {
      final samples = vad.front().samples;
      final startTime = vad.front().start.toDouble() / waveData.sampleRate;
      final endTime =
          startTime + samples.length.toDouble() / waveData.sampleRate;

      final stream = recognizer.createStream();
      stream.acceptWaveform(samples: samples, sampleRate: waveData.sampleRate);
      recognizer.decode(stream);

      final result = recognizer.getResult(stream);
      stream.free();
      print(
          '${startTime.toStringAsPrecision(5)} -- ${endTime.toStringAsPrecision(5)} : ${result.text}');

      vad.pop();
    }
  }

  int numSamples = waveData.samples.length;
  int numIter = numSamples ~/ vadConfig.sileroVad.windowSize;

  // Feed the wave to the VAD one window at a time.
  // NOTE(review): trailing samples shorter than one window are never fed to
  // the VAD and are dropped — confirm this tail loss is acceptable.
  for (int i = 0; i != numIter; ++i) {
    int start = i * vadConfig.sileroVad.windowSize;
    vad.acceptWaveform(Float32List.sublistView(
        waveData.samples, start, start + vadConfig.sileroVad.windowSize));

    transcribePending();
  }

  // Flush forces out any speech segment still buffered inside the VAD.
  vad.flush();
  transcribePending();

  vad.free();

  recognizer.free();
}
29 changes: 29 additions & 0 deletions dart-api-examples/vad-with-non-streaming-asr/run-moonshine.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env bash
# Downloads the int8 Moonshine tiny English model, a long test wave, and the
# silero VAD model (each only if not already present), then runs the Dart
# VAD + non-streaming ASR example.

set -ex

dart pub get

# Single source of truth for the model directory / archive name.
model=sherpa-onnx-moonshine-tiny-en-int8

if [ ! -f ./$model/tokens.txt ]; then
  # --fail makes curl exit non-zero on HTTP errors instead of saving an
  # error page as the archive.
  curl -SL --fail -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model.tar.bz2
  tar xvf $model.tar.bz2
  rm $model.tar.bz2
fi

if [ ! -f ./Obama.wav ]; then
  curl -SL --fail -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
fi

if [ ! -f ./silero_vad.onnx ]; then
  curl -SL --fail -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

dart run \
  ./bin/moonshine.dart \
  --silero-vad ./silero_vad.onnx \
  --preprocessor ./$model/preprocess.onnx \
  --encoder ./$model/encode.int8.onnx \
  --uncached-decoder ./$model/uncached_decode.int8.onnx \
  --cached-decoder ./$model/cached_decode.int8.onnx \
  --tokens ./$model/tokens.txt \
  --input-wav ./Obama.wav
35 changes: 34 additions & 1 deletion flutter/sherpa_onnx/lib/src/offline_recognizer.dart
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,24 @@ class OfflineWhisperModelConfig {
final int tailPaddings;
}

/// Configuration for an offline (non-streaming) Moonshine ASR model.
///
/// Moonshine is split across four ONNX files; each field holds the path to
/// one of them. All paths default to the empty string.
class OfflineMoonshineModelConfig {
  /// Path to the preprocessor model.
  final String preprocessor;

  /// Path to the encoder model.
  final String encoder;

  /// Path to the uncached decoder model.
  final String uncachedDecoder;

  /// Path to the cached decoder model.
  final String cachedDecoder;

  const OfflineMoonshineModelConfig({
    this.preprocessor = '',
    this.encoder = '',
    this.uncachedDecoder = '',
    this.cachedDecoder = '',
  });

  @override
  String toString() =>
      'OfflineMoonshineModelConfig(preprocessor: $preprocessor, '
      'encoder: $encoder, uncachedDecoder: $uncachedDecoder, '
      'cachedDecoder: $cachedDecoder)';
}

class OfflineTdnnModelConfig {
const OfflineTdnnModelConfig({this.model = ''});

Expand Down Expand Up @@ -116,6 +134,7 @@ class OfflineModelConfig {
this.whisper = const OfflineWhisperModelConfig(),
this.tdnn = const OfflineTdnnModelConfig(),
this.senseVoice = const OfflineSenseVoiceModelConfig(),
this.moonshine = const OfflineMoonshineModelConfig(),
required this.tokens,
this.numThreads = 1,
this.debug = true,
Expand All @@ -128,7 +147,7 @@ class OfflineModelConfig {

@override
String toString() {
return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)';
return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)';
}

final OfflineTransducerModelConfig transducer;
Expand All @@ -137,6 +156,7 @@ class OfflineModelConfig {
final OfflineWhisperModelConfig whisper;
final OfflineTdnnModelConfig tdnn;
final OfflineSenseVoiceModelConfig senseVoice;
final OfflineMoonshineModelConfig moonshine;

final String tokens;
final int numThreads;
Expand Down Expand Up @@ -257,6 +277,15 @@ class OfflineRecognizer {
c.ref.model.senseVoice.useInverseTextNormalization =
config.model.senseVoice.useInverseTextNormalization ? 1 : 0;

c.ref.model.moonshine.preprocessor =
config.model.moonshine.preprocessor.toNativeUtf8();
c.ref.model.moonshine.encoder =
config.model.moonshine.encoder.toNativeUtf8();
c.ref.model.moonshine.uncachedDecoder =
config.model.moonshine.uncachedDecoder.toNativeUtf8();
c.ref.model.moonshine.cachedDecoder =
config.model.moonshine.cachedDecoder.toNativeUtf8();

c.ref.model.tokens = config.model.tokens.toNativeUtf8();

c.ref.model.numThreads = config.model.numThreads;
Expand Down Expand Up @@ -294,6 +323,10 @@ class OfflineRecognizer {
calloc.free(c.ref.model.modelType);
calloc.free(c.ref.model.provider);
calloc.free(c.ref.model.tokens);
calloc.free(c.ref.model.moonshine.cachedDecoder);
calloc.free(c.ref.model.moonshine.uncachedDecoder);
calloc.free(c.ref.model.moonshine.encoder);
calloc.free(c.ref.model.moonshine.preprocessor);
calloc.free(c.ref.model.senseVoice.language);
calloc.free(c.ref.model.senseVoice.model);
calloc.free(c.ref.model.tdnn.model);
Expand Down
8 changes: 8 additions & 0 deletions flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,13 @@ final class SherpaOnnxOfflineWhisperModelConfig extends Struct {
external int tailPaddings;
}

/// Dart mirror of the native `SherpaOnnxOfflineMoonshineModelConfig` struct.
///
/// dart:ffi derives the memory layout from the declaration order of the
/// `external` fields, so the order here must match the C declaration exactly.
final class SherpaOnnxOfflineMoonshineModelConfig extends Struct {
  external Pointer<Utf8> preprocessor; // path to the preprocessor model
  external Pointer<Utf8> encoder; // path to the encoder model
  external Pointer<Utf8> uncachedDecoder; // path to the uncached decoder model
  external Pointer<Utf8> cachedDecoder; // path to the cached decoder model
}

final class SherpaOnnxOfflineTdnnModelConfig extends Struct {
external Pointer<Utf8> model;
}
Expand Down Expand Up @@ -236,6 +243,7 @@ final class SherpaOnnxOfflineModelConfig extends Struct {
external Pointer<Utf8> telespeechCtc;

external SherpaOnnxOfflineSenseVoiceModelConfig senseVoice;
external SherpaOnnxOfflineMoonshineModelConfig moonshine;
}

final class SherpaOnnxOfflineRecognizerConfig extends Struct {
Expand Down