From 56962cc89222a88bfacf55f9a2b9d286b03053e9 Mon Sep 17 00:00:00 2001
From: PoTaTo-Mika <148920650+PoTaTo-Mika@users.noreply.github.com>
Date: Sat, 11 May 2024 20:32:10 +0800
Subject: [PATCH] update readme.md (#179)

---
 docs/en/finetune.md  | 39 +++++++++++++++++++++++++++++++++------
 docs/en/index.md     |  4 ++++
 docs/en/inference.md |  8 ++++----
 docs/zh/finetune.md  | 42 ++++++++++++++++++++++++++++++++++--------
 docs/zh/index.md     |  5 +++++
 docs/zh/inference.md | 10 +++++-----
 6 files changed, 85 insertions(+), 23 deletions(-)

diff --git a/docs/en/finetune.md b/docs/en/finetune.md
index 72db1289..d433b905 100644
--- a/docs/en/finetune.md
+++ b/docs/en/finetune.md
@@ -2,7 +2,7 @@
 
 Obviously, when you opened this page, you were not satisfied with the performance of the few-shot pre-trained model. You want to fine-tune a model to improve its performance on your dataset.
 
-`Fish Speech` consists of two modules: `VQGAN` and `LLAMA`.
+`Fish Speech` consists of three modules: `VQGAN`, `LLAMA`, and `VITS`.
 
 !!! info
     You should first conduct the following test to determine if you need to fine-tune `VQGAN`:
@@ -12,6 +12,8 @@ Obviously, when you opened this page, you were not satisfied with the performanc
     This test will generate a `fake.wav` file. If the timbre of this file differs from the speaker's original voice, or if the quality is not high, you need to fine-tune `VQGAN`.
 
     Similarly, you can refer to [Inference](inference.md) to run `generate.py` and evaluate if the prosody meets your expectations. If it does not, then you need to fine-tune `LLAMA`.
+
+    It is recommended to fine-tune the LLAMA and VITS models first, then fine-tune `VQGAN` according to your needs.
 
 ## Fine-tuning VQGAN
 ### 1. Prepare the Dataset
@@ -140,15 +142,12 @@ python tools/llama/build_dataset.py \
 
 After the command finishes executing, you should see the `quantized-dataset-ft.protos` file in the `data` directory.
 
-!!!info
-    For the VITS format, you can specify a file list using `--input xxx.list`.
-
 ### 4. Finally, start the fine-tuning
 
 Similarly, make sure you have downloaded the `LLAMA` weights. If not, run the following command:
 
 ```bash
-huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1-4k.pth --local-dir checkpoints
+huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1.1-4k.pth --local-dir checkpoints
 ```
 
 Finally, you can start the fine-tuning by running the following command:
@@ -169,6 +168,34 @@ After training is complete, you can refer to the [inference](inference.md) secti
 
     By default, the model will only learn the speaker's speech patterns and not the timbre. You still need to use prompts to ensure timbre stability.
     If you want to learn the timbre, you can increase the number of training steps, but this may lead to overfitting.
 
+## Fine-tuning VITS
+### 1. Prepare the Dataset
+
+```
+.
+├── SPK1
+│   ├── 21.15-26.44.lab
+│   ├── 21.15-26.44.mp3
+│   ├── 27.51-29.98.lab
+│   ├── 27.51-29.98.mp3
+│   ├── 30.1-32.71.lab
+│   └── 30.1-32.71.mp3
+└── SPK2
+    ├── 38.79-40.85.lab
+    └── 38.79-40.85.mp3
+```
+!!! note
+    VITS fine-tuning only supports `.lab` label files; please do not use `.list` files!
+
+You need to convert your dataset to the format above and move it into the `data` directory. The audio files may use the `.mp3`, `.wav`, or `.flac` suffix, and the label files are recommended to use the `.lab` suffix.
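+
+If your transcripts currently live in a single `.list` file, a small helper script along the lines of the sketch below can produce the layout above. This is only a sketch and not part of the repository: it assumes each line of the list has the hypothetical form `audio_path|speaker|text`, so adjust the parsing to match your own file.
+
+```python
+# list_to_lab.py -- hypothetical helper, not shipped with Fish Speech.
+# Assumed input: one "audio_path|speaker|text" entry per line of the .list file.
+import shutil
+from pathlib import Path
+
+
+def convert(list_file: str, out_dir: str = "data") -> None:
+    for line in Path(list_file).read_text(encoding="utf-8").splitlines():
+        if not line.strip():
+            continue
+        audio_path, speaker, text = line.split("|", 2)
+        audio = Path(audio_path)
+        spk_dir = Path(out_dir) / speaker
+        spk_dir.mkdir(parents=True, exist_ok=True)
+        # Copy the audio next to its label, keeping the original file name,
+        # and write the transcript as a .lab file with the same stem.
+        shutil.copy(audio, spk_dir / audio.name)
+        (spk_dir / f"{audio.stem}.lab").write_text(text.strip(), encoding="utf-8")
+
+
+if __name__ == "__main__":
+    convert("filelist.list")
+```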
+
+### 2. Start Training
+
+```bash
+python fish_speech/train.py --config-name vits_decoder_finetune
+```
+
 #### Fine-tuning with LoRA
 
 !!! note
@@ -182,7 +209,7 @@ After training, you need to convert the LoRA weights to regular weights before p
 python tools/llama/merge_lora.py \
     --llama-config dual_ar_2_codebook_medium \
     --lora-config r_8_alpha_16 \
-    --llama-weight checkpoints/text2semantic-sft-medium-v1-4k.pth \
+    --llama-weight checkpoints/text2semantic-sft-medium-v1.1-4k.pth \
     --lora-weight results/text2semantic-finetune-medium-lora/checkpoints/step_000000200.ckpt \
     --output checkpoints/merged.ckpt
 ```
diff --git a/docs/en/index.md b/docs/en/index.md
index f4f6b510..6ec6f816 100644
--- a/docs/en/index.md
+++ b/docs/en/index.md
@@ -38,10 +38,14 @@ pip3 install torch torchvision torchaudio
 
 # Install fish-speech
 pip3 install -e .
+
+# Install sox
+apt install libsox-dev
 ```
 
 ## Changelog
 
+- 2024/05/10: Updated Fish-Speech to version 1.1, introducing VITS as the decoder.
 - 2024/04/22: Finished Fish-Speech 1.0 version, significantly modified VQGAN and LLAMA models.
 - 2023/12/28: Added `lora` fine-tuning support.
 - 2023/12/27: Add `gradient checkpointing`, `causual sampling`, and `flash-attn` support.
diff --git a/docs/en/inference.md b/docs/en/inference.md
index 32973b0e..b0bd27b6 100644
--- a/docs/en/inference.md
+++ b/docs/en/inference.md
@@ -16,7 +16,7 @@ Download the required `vqgan` and `text2semantic` models from our Hugging Face r
 
 ```bash
 huggingface-cli download fishaudio/fish-speech-1 vq-gan-group-fsq-2x1024.pth --local-dir checkpoints
-huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1-4k.pth --local-dir checkpoints
+huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1.1-4k.pth --local-dir checkpoints
 ```
 
 ### 1. Generate prompt from voice:
@@ -38,7 +38,7 @@ python tools/llama/generate.py \
     --prompt-text "Your reference text" \
     --prompt-tokens "fake.npy" \
     --config-name dual_ar_2_codebook_medium \
-    --checkpoint-path "checkpoints/text2semantic-sft-medium-v1-4k.pth" \
+    --checkpoint-path "checkpoints/text2semantic-sft-medium-v1.1-4k.pth" \
     --num-samples 2 \
     --compile
 ```
@@ -69,7 +69,7 @@ We provide a HTTP API for inference. You can use the following command to start
 
 ```bash
 python -m tools.api \
     --listen 0.0.0.0:8000 \
-    --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1-4k.pth" \
+    --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1.1-4k.pth" \
     --llama-config-name dual_ar_2_codebook_medium \
     --vqgan-checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth"
 ```
@@ -82,7 +82,7 @@ You can start the WebUI using the following command:
 
 ```bash
 python -m tools.webui \
-    --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1-4k.pth" \
+    --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1.1-4k.pth" \
     --llama-config-name dual_ar_2_codebook_medium \
     --vqgan-checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth"
 ```
diff --git a/docs/zh/finetune.md b/docs/zh/finetune.md
index 213fa83e..25bcc138 100644
--- a/docs/zh/finetune.md
+++ b/docs/zh/finetune.md
@@ -2,7 +2,7 @@
 
 显然, 当你打开这个页面的时候, 你已经对预训练模型 few-shot 的效果不算满意. 你想要微调一个模型, 使得它在你的数据集上表现更好.
 
-`Fish Speech` 由两个模块组成: `VQGAN` 和 `LLAMA`.
+`Fish Speech` 由三个模块组成: `VQGAN`, `LLAMA` 和 `VITS`.
 
 !!! info
     你应该先进行如下测试来判断你是否需要微调 `VQGAN`:
@@ -13,7 +13,9 @@
 
 相应的, 你可以参考 [推理](inference.md) 来运行 `generate.py`, 判断韵律是否满意, 如果不满意, 则需要微调 `LLAMA`.
 
-## VQGAN 微调
+ 建议先对 LLAMA 以及 VITS 进行微调, 最后再根据需要微调 `VQGAN`.
+
+## VQGAN 微调(如果对推理音频不满意再微调)
 ### 1. 准备数据集
 
 ```
@@ -110,7 +112,6 @@ python tools/vqgan/extract_vq.py data \
 
 !!! note
     你可以调整 `--num-workers` 和 `--batch-size` 来提高提取速度, 但是请注意不要超过你的显存限制. 
- 对于 VITS 格式, 你可以使用 `--filelist xxx.list` 来指定文件列表. 该命令会在 `data` 目录下创建 `.npy` 文件, 如下所示: @@ -144,21 +145,19 @@ python tools/llama/build_dataset.py \ 命令执行完毕后, 你应该能在 `data` 目录下看到 `quantized-dataset-ft.protos` 文件. -!!! note - 对于 VITS 格式, 你可以使用 `--input xxx.list` 来指定文件列表. ### 4. 最后, 启动微调 同样的, 请确保你已经下载了 `LLAMA` 权重, 如果没有, 请运行以下命令: ```bash -huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1-4k.pth --local-dir checkpoints +huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1.1-4k.pth --local-dir checkpoints ``` 对于中国大陆用户, 可使用 mirror 下载. ```bash -HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1-4k.pth --local-dir checkpoints +HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1.1-4k.pth --local-dir checkpoints ``` 最后, 你可以运行以下命令来启动微调: @@ -180,6 +179,33 @@ python fish_speech/train.py --config-name text2semantic_finetune \ 默认配置下, 基本只会学到说话人的发音方式, 而不包含音色, 你依然需要使用 prompt 来保证音色的稳定性. 如果你想要学到音色, 请将训练步数调大, 但这有可能会导致过拟合. +## VITS微调 +### 1. 准备数据集 + +``` +. +├── SPK1 +│ ├── 21.15-26.44.lab +│ ├── 21.15-26.44.mp3 +│ ├── 27.51-29.98.lab +│ ├── 27.51-29.98.mp3 +│ ├── 30.1-32.71.lab +│ └── 30.1-32.71.mp3 +└── SPK2 + ├── 38.79-40.85.lab + └── 38.79-40.85.mp3 +``` +!!! note + VITS微调目前仅支持.lab作为标签文件,不支持filelist形式! + +你需要将数据集转为以上格式, 并放到 `data` 下, 音频后缀可以为 `.mp3`, `.wav` 或 `.flac`, 标注文件后缀建议为 `.lab`. + +### 2.启动训练 + +```bash +python fish_speech/train.py --config-name vits_decoder_finetune +``` + #### 使用 lora 进行微调 !!! note lora 可以减少模型过拟合的风险, 但是相应的会导致在大数据集上欠拟合. @@ -192,7 +218,7 @@ python fish_speech/train.py --config-name text2semantic_finetune \ python tools/llama/merge_lora.py \ --llama-config dual_ar_2_codebook_medium \ --lora-config r_8_alpha_16 \ - --llama-weight checkpoints/text2semantic-sft-medium-v1-4k.pth \ + --llama-weight checkpoints/text2semantic-sft-medium-v1.1-4k.pth \ --lora-weight results/text2semantic-finetune-medium-lora/checkpoints/step_000000200.ckpt \ --output checkpoints/merged.ckpt ``` diff --git a/docs/zh/index.md b/docs/zh/index.md index 40bf63ba..cc770d62 100644 --- a/docs/zh/index.md +++ b/docs/zh/index.md @@ -38,10 +38,15 @@ pip3 install torch torchvision torchaudio # 安装 fish-speech pip3 install -e . + +# 安装 sox +apt install libsox-dev ``` + ## 更新日志 +- 2024/05/10: 更新了 Fish-Speech 到 1.1 版本,引入了 VITS 作为Decoder部分. - 2024/04/22: 完成了 Fish-Speech 1.0 版本, 大幅修改了 VQGAN 和 LLAMA 模型. - 2023/12/28: 添加了 `lora` 微调支持. - 2023/12/27: 添加了 `gradient checkpointing`, `causual sampling` 和 `flash-attn` 支持. diff --git a/docs/zh/inference.md b/docs/zh/inference.md index 3b671422..3dc0ed94 100644 --- a/docs/zh/inference.md +++ b/docs/zh/inference.md @@ -16,12 +16,12 @@ ```bash huggingface-cli download fishaudio/fish-speech-1 vq-gan-group-fsq-2x1024.pth --local-dir checkpoints -huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1-4k.pth --local-dir checkpoints +huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1.1-4k.pth --local-dir checkpoints ``` 对于中国大陆用户,可使用mirror下载。 ```bash HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1 vq-gan-group-fsq-2x1024.pth --local-dir checkpoints -HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1-4k.pth --local-dir checkpoints +HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1.1-4k.pth --local-dir checkpoints ``` ### 1. 
从语音生成 prompt: @@ -43,7 +43,7 @@ python tools/llama/generate.py \ --prompt-text "你的参考文本" \ --prompt-tokens "fake.npy" \ --config-name dual_ar_2_codebook_medium \ - --checkpoint-path "checkpoints/text2semantic-sft-medium-v1-4k.pth" \ + --checkpoint-path "checkpoints/text2semantic-sft-medium-v1.1-4k.pth" \ --num-samples 2 \ --compile ``` @@ -74,7 +74,7 @@ python tools/vqgan/inference.py \ ```bash python -m tools.api \ --listen 0.0.0.0:8000 \ - --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1-4k.pth" \ + --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1.1-4k.pth" \ --llama-config-name dual_ar_2_codebook_medium \ --vqgan-checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth" @@ -90,7 +90,7 @@ HF_ENDPOINT=https://hf-mirror.com python -m ... ```bash python -m tools.webui \ - --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1-4k.pth" \ + --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1.1-4k.pth" \ --llama-config-name dual_ar_2_codebook_medium \ --vqgan-checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth" ```