Export Kokoro 1.0 to sherpa-onnx (#1788)
csukuangfj authored Feb 5, 2025
1 parent 8677d83 commit 08cefe8
Showing 13 changed files with 707 additions and 15 deletions.
100 changes: 90 additions & 10 deletions .github/workflows/export-kokoro.yaml
@@ -4,6 +4,7 @@ on:
push:
branches:
- export-kokoro
- kokoro-1.0-2

workflow_dispatch:

@@ -14,12 +15,13 @@ concurrency:
jobs:
export-kokoro-to-onnx:
if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
name: export kokoro
name: export kokoro ${{ matrix.version }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
version: ["0.19", "1.0"]
python-version: ["3.10"]

steps:
@@ -33,7 +35,7 @@ jobs:
- name: Install Python dependencies
shell: bash
run: |
pip install -q "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch
- name: Run
shell: bash
@@ -42,9 +44,16 @@ jobs:
tar xf espeak-ng-data.tar.bz2
rm espeak-ng-data.tar.bz2
cd scripts/kokoro
./run.sh
- name: Collect results
v=${{ matrix.version }}
if [[ $v = "0.19" ]]; then
./run.sh
elif [[ $v == "1.0" ]]; then
cd v1.0
./run.sh
fi
- name: Collect results ${{ matrix.version }}
if: matrix.version == '0.19'
shell: bash
run: |
src=scripts/kokoro
@@ -53,17 +62,39 @@ jobs:
mkdir $d
cp -a LICENSE $d/LICENSE
cp -a espeak-ng-data $d/
cp -v $src/kokoro-v0_19_hf.onnx $d/model.onnx
cp -v $src/kokoro-v0_19.onnx $d/model.onnx
cp -v $src/voices.bin $d/
cp -v $src/tokens.txt $d/
cp -v $src/README-new.md $d/README.md
ls -lh $d/
tar cjfv $d.tar.bz2 $d
rm -rf $d
ls -h $.tar.bz2
ls -lh $d.tar.bz2
- name: Collect results ${{ matrix.version }}
if: matrix.version == '1.0'
shell: bash
run: |
src=scripts/kokoro/v1.0
d=kokoro-multi-lang-v1_0
mkdir $d
cp -a LICENSE $d/LICENSE
cp -a espeak-ng-data $d/
cp -v $src/kokoro.onnx $d/model.onnx
cp -v $src/voices.bin $d/
cp -v $src/tokens.txt $d/
cp -v $src/lexicon*.txt $d/
cp -v $src/README.md $d/README.md
ls -lh $d/
tar cjfv $d.tar.bz2 $d
rm -rf $d
ls -lh $d.tar.bz2
- name: Publish to huggingface
- name: Publish to huggingface ${{ matrix.version }}
if: matrix.version == '0.19'
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
@@ -92,9 +123,9 @@ jobs:
cp -a ../espeak-ng-data ./
mkdir -p test_wavs
cp -v ../scripts/kokoro/kokoro-v0_19_hf.onnx ./model.onnx
cp -v ../scripts/kokoro/kokoro-v0_19.onnx ./model.onnx
cp -v ../scripts/kokoro/kokoro-v0_19_hf-*.wav ./test_wavs/
cp -v ../scripts/kokoro/kokoro-v0_19-*.wav ./test_wavs/
cp -v ../scripts/kokoro/tokens.txt .
cp -v ../scripts/kokoro/voices.bin .
@@ -111,6 +142,55 @@ jobs:
git commit -m "add models"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true
- name: Publish to huggingface ${{ matrix.version }}
if: matrix.version == '1.0'
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
with:
max_attempts: 20
timeout_seconds: 200
shell: bash
command: |
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"
rm -rf huggingface
export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 huggingface
cd huggingface
rm -rf ./*
git fetch
git pull
git lfs track "cmn_dict"
git lfs track "ru_dict"
git lfs track "*.wav"
git lfs track "lexicon*.txt"
cp -a ../espeak-ng-data ./
cp -v ../scripts/kokoro/v1.0/kokoro.onnx ./model.onnx
cp -v ../scripts/kokoro/v1.0/tokens.txt .
cp -v ../scripts/kokoro/v1.0/voices.bin .
cp -v ../scripts/kokoro/v1.0/lexicon*.txt .
cp -v ../scripts/kokoro/v1.0/README.md ./README.md
cp -v ../LICENSE ./
git lfs track "*.onnx"
git add .
ls -lh
git status
git commit -m "add models"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 main || true
- name: Release
uses: svenstaro/upload-release-action@v2
with:
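The two "Collect results" steps above define the layout of the released archives: model.onnx, voices.bin, tokens.txt, espeak-ng-data/, LICENSE, a README, and (for 1.0) the lexicon*.txt files. A minimal sketch that checks a collected kokoro-multi-lang-v1_0 directory for those files before it is tarred; the file names come from the workflow above, while the checker itself is only an illustration, not part of this commit:

#!/usr/bin/env python3
# Check that a collected kokoro-multi-lang-v1_0 directory contains the files
# the "Collect results 1.0" step above copies in, before it is tarred.
from pathlib import Path

d = Path("kokoro-multi-lang-v1_0")
expected = ["LICENSE", "model.onnx", "voices.bin", "tokens.txt", "README.md"]

missing = [name for name in expected if not (d / name).is_file()]
if not (d / "espeak-ng-data").is_dir():
    missing.append("espeak-ng-data/")
if not list(d.glob("lexicon*.txt")):
    missing.append("lexicon*.txt")

print("missing:", missing or "nothing")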
4 changes: 4 additions & 0 deletions .gitignore
@@ -128,3 +128,7 @@ harmony-os/SherpaOnnxHar/sherpa_onnx/CHANGELOG.md
matcha-icefall-zh-baker
matcha-icefall-en_US-ljspeech
kokoro-en-v0_19
*.pt
lexicon.txt
us_gold.json
us_silver.json
scripts/kokoro/add_meta_data.py
@@ -69,6 +69,14 @@ def main():
for k in keys:
f.write(voices[k].tobytes())

speaker2id_str = ""
id2speaker_str = ""
sep = ""
for i, s in enumerate(keys):
speaker2id_str += f"{sep}{s}->{i}"
id2speaker_str += f"{sep}{i}->{s}"
sep = ","

meta_data = {
"model_type": "kokoro",
"language": "English",
@@ -78,6 +86,8 @@
"voice": "en-us",
"style_dim": ",".join(map(str, voices[keys[0]].shape)),
"n_speakers": len(keys),
"speaker2id": speaker2id_str,
"id2speaker": id2speaker_str,
"speaker_names": ",".join(keys),
"model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files",
"see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS",
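For reference on the artifacts written above: voices.bin is each speaker's style array written back-to-back with tobytes(), in the same key order used to number the speakers, and speaker2id/id2speaker are stored in the ONNX metadata as comma-separated name->id pairs. A minimal sketch of how a consumer could undo both, assuming float32 style arrays and a placeholder shape (the real dtype and style_dim are not shown in this diff and should be taken from the model metadata):

import numpy as np

# Placeholder values: in practice, read "style_dim", "n_speakers" and
# "speaker2id" back from the ONNX metadata written above.
style_dim = (511, 1, 256)                # assumed shape of one style array
speaker2id_str = "af->0,af_bella->1"     # assumed excerpt of the real string

# Undo the "name->id" serialization.
speaker2id = {}
for pair in speaker2id_str.split(","):
    name, idx = pair.split("->")
    speaker2id[name] = int(idx)

# voices.bin is the per-speaker style arrays concatenated in id order.
styles = np.fromfile("voices.bin", dtype=np.float32)  # dtype is an assumption
styles = styles.reshape(len(speaker2id), *style_dim)
print(styles[speaker2id["af_bella"]].shape)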
10 changes: 5 additions & 5 deletions scripts/kokoro/run.sh
@@ -16,8 +16,8 @@ https://huggingface.co/hexgrad/Kokoro-82M/discussions/14
EOF

files=(
kokoro-v0_19_hf.onnx
# kokoro-v0_19.onnx
# kokoro-v0_19_hf.onnx
kokoro-v0_19.onnx
# kokoro-quant.onnx
# kokoro-quant-convinteger.onnx
voices.json
@@ -30,14 +30,14 @@ for f in ${files[@]}; do
done

models=(
# kokoro-v0_19
kokoro-v0_19
# kokoro-quant
# kokoro-quant-convinteger
kokoro-v0_19_hf
# kokoro-v0_19_hf
)

for m in ${models[@]}; do
./add-meta-data.py --model $m.onnx --voices ./voices.json
./add_meta_data.py --model $m.onnx --voices ./voices.json
done

ls -l
5 changes: 5 additions & 0 deletions scripts/kokoro/v1.0/.gitignore
@@ -0,0 +1,5 @@
config.json
*.json
*.txt
.add-meta-data.done
voices
3 changes: 3 additions & 0 deletions scripts/kokoro/v1.0/README.md
@@ -0,0 +1,3 @@
# Introduction

This directory is for kokoro v1.0
Empty file added scripts/kokoro/v1.0/__init__.py
Empty file.
64 changes: 64 additions & 0 deletions scripts/kokoro/v1.0/add_meta_data.py
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)


import argparse
import json
from pathlib import Path

import numpy as np
import onnx
import torch

from generate_voices_bin import speaker2id


def main():
model = onnx.load("./kokoro.onnx")
style = torch.load("./voices/af_alloy.pt", weights_only=True, map_location="cpu")

id2speaker_str = ""
speaker2id_str = ""
sep = ""
for s, i in speaker2id.items():
speaker2id_str += f"{sep}{s}->{i}"
id2speaker_str += f"{sep}{i}->{s}"
sep = ","

meta_data = {
"model_type": "kokoro",
"language": "English",
"has_espeak": 1,
"sample_rate": 24000,
"version": 2,
"voice": "en-us",
"style_dim": ",".join(map(str, style.shape)),
"n_speakers": len(speaker2id),
"id2speaker": id2speaker_str,
"speaker2id": speaker2id_str,
"speaker_names": ",".join(map(str, speaker2id.keys())),
"model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files",
"see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS",
"see_also_2": "https://huggingface.co/hexgrad/Kokoro-82M",
"maintainer": "k2-fsa",
"comment": "This is Kokoro v1.0, a multilingual TTS model, supporting English, Chinese, French, Japanese etc.",
}

print(model.metadata_props)

while len(model.metadata_props):
model.metadata_props.pop()

for key, value in meta_data.items():
meta = model.metadata_props.add()
meta.key = key
meta.value = str(value)
print("--------------------")

print(model.metadata_props)

onnx.save(model, "./kokoro.onnx")


if __name__ == "__main__":
main()
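A quick way to confirm that add_meta_data.py actually wrote these entries is to load the model again and inspect metadata_props; a small check along these lines (the file name matches the script above, everything else is just an illustrative sketch):

#!/usr/bin/env python3
# Read the metadata back from kokoro.onnx and check a few expected keys.
import onnx

model = onnx.load("./kokoro.onnx")
meta = {p.key: p.value for p in model.metadata_props}

for key in ("model_type", "sample_rate", "n_speakers", "speaker2id"):
    assert key in meta, f"missing metadata key: {key}"

print(meta["model_type"], meta["sample_rate"], meta["n_speakers"])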
89 changes: 89 additions & 0 deletions scripts/kokoro/v1.0/generate_lexicon.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)

import json
from pypinyin import phrases_dict, pinyin_dict
from misaki import zh
from typing import List, Tuple


def generate_english_lexicon(kind: str):
assert kind in ("us", "gb"), kind
# If you want to add new words, please add them to
# the user_defined dict.
user_defined = {
"Kokoro": "kˈOkəɹO",
"Misaki": "misˈɑki",
}

user_defined_lower = dict()
for k, v in user_defined.items():
user_defined_lower[k.lower()] = v

with open(f"./{kind}_gold.json", encoding="utf-8") as f:
gold = json.load(f)

with open(f"./{kind}_silver.json", encoding="utf-8") as f:
silver = json.load(f)

# words in us_gold have a higher priority than those in us_silver, so
# we put us_gold after us_silver below
english = {**silver, **gold}

lexicon = dict()
for k, v in english.items():
k_lower = k.lower()

if k_lower in user_defined_lower:
print(f"{k} already exist in the user defined dict. Skip adding")
continue

if isinstance(v, str):
lexicon[k_lower] = v
else:
assert isinstance(v, dict), (k, v)
assert "DEFAULT" in v, (k, v)
lexicon[k_lower] = v["DEFAULT"]

return list(user_defined_lower.items()) + list(lexicon.items())


def generate_chinese_lexicon():
word_dict = pinyin_dict.pinyin_dict
phrases = phrases_dict.phrases_dict

g2p = zh.ZHG2P()
lexicon = []

for key in word_dict:
if not (0x4E00 <= key <= 0x9FFF):
continue
w = chr(key)
tokens: str = g2p(w)
lexicon.append((w, tokens))

for key in phrases:
tokens: str = g2p(key)
lexicon.append((key, tokens))
return lexicon


def save(filename: str, lexicon: List[Tuple[str, str]]):
with open(filename, "w", encoding="utf-8") as f:
for word, phones in lexicon:
tokens = " ".join(list(phones))
f.write(f"{word} {tokens}\n")


def main():
us = generate_english_lexicon("us")
gb = generate_english_lexicon("gb")
zh = generate_chinese_lexicon()

save("lexicon-us-en.txt", us)
save("lexicon-gb-en.txt", gb)
save("lexicon-zh.txt", zh)


if __name__ == "__main__":
main()
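The save() helper above writes one entry per line as "word p1 p2 ...", where each pi is a single character of the misaki phoneme string. A minimal sketch of a matching loader, assuming one of the lexicon files generated above; the loader itself is not part of this commit:

from typing import Dict


def load_lexicon(filename: str) -> Dict[str, str]:
    """Read "word p1 p2 ..." lines back into a word -> phoneme-string dict."""
    lexicon: Dict[str, str] = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:
            fields = line.rstrip("\n").split(" ")
            word, phones = fields[0], fields[1:]
            lexicon[word] = "".join(phones)  # undo the per-character split
    return lexicon


if __name__ == "__main__":
    lex = load_lexicon("lexicon-us-en.txt")
    print(len(lex), lex.get("kokoro"))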