Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add C++ and Python API for Kokoro TTS models. #1715

Merged
merged 8 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/scripts/test-offline-tts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,31 @@ which $EXE
# test waves are saved in ./tts
mkdir ./tts

log "------------------------------------------------------------"
log "kokoro-en-v0_19"
log "------------------------------------------------------------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
tar xf kokoro-en-v0_19.tar.bz2
rm kokoro-en-v0_19.tar.bz2

# mapping of sid to voice name
# 0->af, 1->af_bella, 2->af_nicole, 3->af_sarah, 4->af_sky, 5->am_adam
# 6->am_michael, 7->bf_emma, 8->bf_isabella, 9->bm_george, 10->bm_lewis

for sid in $(seq 0 10); do
$EXE \
--debug=1 \
--kokoro-model=./kokoro-en-v0_19/model.onnx \
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
--num-threads=2 \
--sid=$sid \
--output-filename="./tts/kokoro-$sid.wav" \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
done
rm -rf kokoro-en-v0_19

log "------------------------------------------------------------"
log "matcha-icefall-en_US-ljspeech"
log "------------------------------------------------------------"
Expand Down
19 changes: 19 additions & 0 deletions .github/scripts/test-python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,25 @@ log "Offline TTS test"
# test waves are saved in ./tts
mkdir ./tts

log "kokoro-en-v0_19 test"

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
tar xf kokoro-en-v0_19.tar.bz2
rm kokoro-en-v0_19.tar.bz2

python3 ./python-api-examples/offline-tts.py \
--debug=1 \
--kokoro-model=./kokoro-en-v0_19/model.onnx \
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
--num-threads=2 \
--sid=10 \
--output-filename="./tts/kokoro-10.wav" \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."

rm -rf kokoro-en-v0_19

log "matcha-ljspeech-en test"

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
Expand Down
63 changes: 58 additions & 5 deletions python-api-examples/offline-tts-play.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

Usage:

Example (1/5)
Example (1/6)

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2
Expand All @@ -23,7 +23,7 @@
--output-filename=./generated.wav \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."

Example (2/5)
Example (2/6)

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
tar xvf vits-zh-aishell3.tar.bz2
Expand All @@ -37,7 +37,7 @@
--output-filename=./liubei-21.wav \
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"

Example (3/5)
Example (3/6)

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
Expand All @@ -53,7 +53,7 @@
--output-filename=./test-2.wav \
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"

Example (4/5)
Example (4/6)

curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
Expand All @@ -71,7 +71,7 @@
--output-filename=./test-matcha.wav \
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"

Example (5/5)
Example (5/6)

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
Expand All @@ -88,6 +88,22 @@
--num-threads=2 \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."

Example (6/6)

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
tar xf kokoro-en-v0_19.tar.bz2
rm kokoro-en-v0_19.tar.bz2

python3 ./python-api-examples/offline-tts.py \
--debug=1 \
--kokoro-model=./kokoro-en-v0_19/model.onnx \
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
--num-threads=2 \
--sid=10 \
--output-filename="./kokoro-10.wav" \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."

You can find more models at
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
Expand Down Expand Up @@ -202,13 +218,44 @@ def add_matcha_args(parser):
)


def add_kokoro_args(parser):
parser.add_argument(
"--kokoro-model",
type=str,
default="",
help="Path to model.onnx for kokoro",
)

parser.add_argument(
"--kokoro-voices",
type=str,
default="",
help="Path to voices.bin for kokoro",
)

parser.add_argument(
"--kokoro-tokens",
type=str,
default="",
help="Path to tokens.txt for kokoro",
)

parser.add_argument(
"--kokoro-data-dir",
type=str,
default="",
help="Path to the dict directory of espeak-ng.",
)


def get_args():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

add_vits_args(parser)
add_matcha_args(parser)
add_kokoro_args(parser)

parser.add_argument(
"--tts-rule-fsts",
Expand Down Expand Up @@ -407,6 +454,12 @@ def main():
data_dir=args.matcha_data_dir,
dict_dir=args.matcha_dict_dir,
),
kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
model=args.kokoro_model,
voices=args.kokoro_voices,
tokens=args.kokoro_tokens,
data_dir=args.kokoro_data_dir,
),
provider=args.provider,
debug=args.debug,
num_threads=args.num_threads,
Expand Down
66 changes: 60 additions & 6 deletions python-api-examples/offline-tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

Usage:

Example (1/5)
Example (1/6)

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2
Expand All @@ -24,7 +24,7 @@
--output-filename=./generated.wav \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."

Example (2/5)
Example (2/6)

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
tar xvf vits-icefall-zh-aishell3.tar.bz2
Expand All @@ -38,7 +38,7 @@
--output-filename=./liubei-21.wav \
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"

Example (3/5)
Example (3/6)

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
Expand All @@ -54,7 +54,7 @@
--output-filename=./test-2.wav \
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"

Example (4/5)
Example (4/6)

curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
Expand All @@ -72,7 +72,7 @@
--output-filename=./test-matcha.wav \
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"

Example (5/5)
Example (5/6)

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
Expand All @@ -89,6 +89,23 @@
--num-threads=2 \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."

Example (6/6)

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
tar xf kokoro-en-v0_19.tar.bz2
rm kokoro-en-v0_19.tar.bz2

python3 ./python-api-examples/offline-tts.py \
--debug=1 \
--kokoro-model=./kokoro-en-v0_19/model.onnx \
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
--num-threads=2 \
--sid=10 \
--output-filename="./kokoro-10.wav" \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."

You can find more models at
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models

Expand Down Expand Up @@ -188,13 +205,44 @@ def add_matcha_args(parser):
)


def add_kokoro_args(parser):
parser.add_argument(
"--kokoro-model",
type=str,
default="",
help="Path to model.onnx for kokoro",
)

parser.add_argument(
"--kokoro-voices",
type=str,
default="",
help="Path to voices.bin for kokoro",
)

parser.add_argument(
"--kokoro-tokens",
type=str,
default="",
help="Path to tokens.txt for kokoro",
)

parser.add_argument(
"--kokoro-data-dir",
type=str,
default="",
help="Path to the dict directory of espeak-ng.",
)


def get_args():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

add_vits_args(parser)
add_matcha_args(parser)
add_kokoro_args(parser)

parser.add_argument(
"--tts-rule-fsts",
Expand All @@ -206,7 +254,7 @@ def get_args():
parser.add_argument(
"--max-num-sentences",
type=int,
default=2,
default=1,
help="""Max number of sentences in a batch to avoid OOM if the input
text is very long. Set it to -1 to process all the sentences in a
single batch. A smaller value does not mean it is slower compared
Expand Down Expand Up @@ -289,6 +337,12 @@ def main():
data_dir=args.matcha_data_dir,
dict_dir=args.matcha_dict_dir,
),
kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
model=args.kokoro_model,
voices=args.kokoro_voices,
tokens=args.kokoro_tokens,
data_dir=args.kokoro_data_dir,
),
provider=args.provider,
debug=args.debug,
num_threads=args.num_threads,
Expand Down
2 changes: 2 additions & 0 deletions sherpa-onnx/csrc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ if(SHERPA_ONNX_ENABLE_TTS)
offline-tts-character-frontend.cc
offline-tts-frontend.cc
offline-tts-impl.cc
offline-tts-kokoro-model-config.cc
offline-tts-kokoro-model.cc
offline-tts-matcha-model-config.cc
offline-tts-matcha-model.cc
offline-tts-model-config.cc
Expand Down
2 changes: 1 addition & 1 deletion sherpa-onnx/csrc/melo-tts-lexicon.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <vector>

#include "sherpa-onnx/csrc/offline-tts-frontend.h"
#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"

namespace sherpa_onnx {

Expand Down
2 changes: 1 addition & 1 deletion sherpa-onnx/csrc/offline-tts-character-frontend.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include <vector>

#include "sherpa-onnx/csrc/offline-tts-frontend.h"
#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"

namespace sherpa_onnx {

Expand Down
10 changes: 8 additions & 2 deletions sherpa-onnx/csrc/offline-tts-impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "rawfile/raw_file_manager.h"
#endif

#include "sherpa-onnx/csrc/offline-tts-kokoro-impl.h"
#include "sherpa-onnx/csrc/offline-tts-matcha-impl.h"
#include "sherpa-onnx/csrc/offline-tts-vits-impl.h"

Expand All @@ -37,18 +38,23 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
const OfflineTtsConfig &config) {
if (!config.model.vits.model.empty()) {
return std::make_unique<OfflineTtsVitsImpl>(config);
} else if (!config.model.matcha.acoustic_model.empty()) {
return std::make_unique<OfflineTtsMatchaImpl>(config);
}
return std::make_unique<OfflineTtsMatchaImpl>(config);

return std::make_unique<OfflineTtsKokoroImpl>(config);
}

template <typename Manager>
std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
Manager *mgr, const OfflineTtsConfig &config) {
if (!config.model.vits.model.empty()) {
return std::make_unique<OfflineTtsVitsImpl>(mgr, config);
} else if (!config.model.matcha.acoustic_model.empty()) {
return std::make_unique<OfflineTtsMatchaImpl>(mgr, config);
}

return std::make_unique<OfflineTtsMatchaImpl>(mgr, config);
return std::make_unique<OfflineTtsKokoroImpl>(mgr, config);
}

#if __ANDROID_API__ >= 9
Expand Down
Loading
Loading