[recipe] fix errors in voxceleb/v1/Whisper-PMFA #357

Merged: 2 commits, Aug 31, 2024
1 change: 1 addition & 0 deletions README.md
@@ -60,6 +60,7 @@ pre-commit install # for clean and tidy code
```

## 🔥 News
+* 2024.08.30: We support whisper_encoder based frontend and propose the [Whisper-PMFA](https://arxiv.org/pdf/2408.15585) framework, check [#356](https://github.com/wenet-e2e/wespeaker/pull/356).
* 2024.08.20: Update diarization recipe for VoxConverse dataset by leveraging umap dimensionality reduction and hdbscan clustering, see [#347](https://github.com/wenet-e2e/wespeaker/pull/347) and [#352](https://github.com/wenet-e2e/wespeaker/pull/352).
* 2024.08.18: Support using ssl pre-trained models as the frontend. The [WavLM recipe](https://github.com/wenet-e2e/wespeaker/blob/master/examples/voxceleb/v2/run_wavlm.sh) is also provided, see [#344](https://github.com/wenet-e2e/wespeaker/pull/344).
* 2024.05.15: Add support for [quality-aware score calibration](https://arxiv.org/pdf/2211.00815), see [#320](https://github.com/wenet-e2e/wespeaker/pull/320).
2 changes: 1 addition & 1 deletion examples/voxceleb/v1/Whisper-PMFA/README.md
@@ -20,5 +20,5 @@
| | √ | 6.63M | 1.88 |
| Whisper-PMFA | × | 478.7M | 1.62 |
| | √ | 478.7M | **1.42** |
-| Whisper-PMFA with LoRa (Coming soon) | √ | 10.9M | 1.62 |
+| Whisper-PMFA with LoRA (Coming soon) | √ | 10.9M | 1.62 |
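
The LoRA row drops the fine-tuned parameter count from 478.7M to 10.9M because a low-rank adapter trains only two small factors per adapted weight matrix while the original weights stay frozen. The sketch below shows the parameter arithmetic for a single linear layer; the hidden size matches Whisper large-v2's 1280, but the rank and the set of adapted layers are illustrative assumptions, not the configuration behind the 10.9M figure.

```python
# LoRA trainable-parameter arithmetic for one frozen d x k weight matrix W.
# The adapter learns B (d x r) and A (r x k), so the effective weight is W + B @ A.

def lora_trainable_params(d: int, k: int, r: int) -> int:
    """Parameters added by a rank-r LoRA adapter on a d x k weight."""
    return d * r + r * k

d = k = 1280                              # Whisper large-v2 model dimension
full = d * k                              # 1,638,400 params if the matrix itself were fine-tuned
lora = lora_trainable_params(d, k, r=8)   # 20,480 params, roughly 1.25% of full
print(full, lora)
```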

@@ -1,8 +1,8 @@
### train configuraton

-exp_dir: exp/test
+exp_dir: exp/Whisper_PMFA_large_v2_voxceleb1_mel_5s
gpus: "[0,1]"
-num_avg: 10
+num_avg: 1
enable_amp: False # whether enable automatic mixed precision training

seed: 42
@@ -57,7 +57,7 @@ margin_update:
initial_margin: 0.2
final_margin: 0.2
increase_start_epoch: 0
-fix_start_epoch: 30
+fix_start_epoch: 4
update_margin: True
increase_type: "exp" # exp, linear

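fix_start_epoch marks the epoch from which the margin stays pinned at final_margin; lowering it from 30 to 4 matches the much shorter fine-tuning schedule used for this recipe. Below is a minimal sketch of the flat/ramp/fixed shape these options describe, with an assumed exponential ramp; it is not wespeaker's scheduler implementation.

```python
import math

def margin_at_epoch(epoch, initial_margin=0.2, final_margin=0.2,
                    increase_start_epoch=0, fix_start_epoch=4,
                    increase_type="exp"):
    """Sketch of a margin schedule: flat, then ramp up, then fixed at final_margin."""
    if epoch < increase_start_epoch:
        return initial_margin
    if epoch >= fix_start_epoch:
        return final_margin
    # Progress through the ramp, in [0, 1).
    t = (epoch - increase_start_epoch) / (fix_start_epoch - increase_start_epoch)
    if increase_type == "exp":
        # Assumed exponential-style easing toward the final margin.
        t = (1.0 - math.exp(-5.0 * t)) / (1.0 - math.exp(-5.0))
    return initial_margin + (final_margin - initial_margin) * t

# With initial_margin == final_margin == 0.2, as in this config, the margin is
# constant, so fix_start_epoch only matters when the two values differ.
```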
@@ -1,8 +1,8 @@
### train configuraton

-exp_dir: exp/test
+exp_dir: exp/Whisper_PMFA_large_v2_voxceleb1_mel_5s
gpus: "[0,1]"
-num_avg: 10
+num_avg: 1
enable_amp: False # whether enable automatic mixed precision training

seed: 42
@@ -56,7 +56,7 @@ margin_update:
initial_margin: 0.2
final_margin: 0.2
increase_start_epoch: 0
-fix_start_epoch: 30
+fix_start_epoch: 8
update_margin: True
increase_type: "exp" # exp, linear

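num_avg sets how many of the final checkpoints are averaged into the model used for embedding extraction; num_avg: 1 simply evaluates the last checkpoint without averaging. For reference, a generic sketch of weight averaging over several PyTorch state dicts is shown below; it is not the toolkit's averaging script.

```python
import torch

def average_checkpoints(paths):
    """Average parameter tensors across several checkpoints (generic sketch)."""
    avg = None
    for path in paths:
        state = torch.load(path, map_location="cpu")
        if avg is None:
            avg = {k: v.clone().float() for k, v in state.items()}
        else:
            for k, v in state.items():
                avg[k] += v.float()
    return {k: v / len(paths) for k, v in avg.items()}

# With num_avg == 1 the list holds a single path, so the result is just that
# checkpoint's weights.
```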
2 changes: 1 addition & 1 deletion examples/voxceleb/v1/Whisper-PMFA/local/score.sh
@@ -44,7 +44,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
scores_dir=${exp_dir}/scores
for x in $trials; do
python wespeaker/bin/compute_metrics.py \
---p_target 0.01 \
+--p_target 0.05 \
--c_fa 1 \
--c_miss 1 \
${scores_dir}/${x}.score \
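Raising p_target from 0.01 to 0.05 only changes the detection-cost metric reported alongside EER; the EER itself is unaffected. The cost at a single operating point is c_miss * p_target * P_miss + c_fa * (1 - p_target) * P_fa, and minDCF minimizes this over thresholds. A small sketch with hypothetical miss and false-alarm rates:

```python
def detection_cost(p_miss, p_fa, p_target=0.05, c_miss=1.0, c_fa=1.0):
    """Detection cost at one operating point (the quantity minDCF minimizes over thresholds)."""
    return c_miss * p_target * p_miss + c_fa * (1.0 - p_target) * p_fa

# Hypothetical operating point: 2% misses, 1% false alarms.
print(detection_cost(0.02, 0.01, p_target=0.01))  # 0.0101 -- false alarms dominate
print(detection_cost(0.02, 0.01, p_target=0.05))  # 0.0105 -- misses weigh more heavily
```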
2 changes: 1 addition & 1 deletion examples/voxceleb/v1/Whisper-PMFA/local/score_norm.sh
@@ -57,7 +57,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
for x in ${trials}; do
scores_dir=${exp_dir}/scores
python wespeaker/bin/compute_metrics.py \
---p_target 0.01 \
+--p_target 0.05 \
--c_fa 1 \
--c_miss 1 \
${scores_dir}/${output_name}_${x}.score \
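The same p_target change is applied to the normalized scores. As background, the asnorm method selected in run.sh standardizes each trial score against the top-n most competitive cohort scores of the enrollment and test sides; a minimal sketch, assuming the cohort score vectors are already computed (this is not the wespeaker implementation):

```python
import numpy as np

def as_norm(score, enroll_cohort, test_cohort, top_n=300):
    """Adaptive symmetric score normalization (AS-norm) sketch."""
    e_top = np.sort(enroll_cohort)[-top_n:]   # top-n cohort scores for the enrollment side
    t_top = np.sort(test_cohort)[-top_n:]     # top-n cohort scores for the test side
    z_e = (score - e_top.mean()) / (e_top.std() + 1e-8)
    z_t = (score - t_top.mean()) / (t_top.std() + 1e-8)
    return 0.5 * (z_e + z_t)

# Example with random cohort scores (illustrative only).
rng = np.random.default_rng(0)
print(as_norm(0.7, rng.normal(0.1, 0.2, 5000), rng.normal(0.1, 0.2, 5000)))
```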
17 changes: 9 additions & 8 deletions examples/voxceleb/v1/Whisper-PMFA/run.sh
@@ -1,22 +1,20 @@
#!/bin/bash

-# Copyright 2022 Hongji Wang (jijijiang77@gmail.com)
-# 2022 Chengdong Liang (liangchengdong@mail.nwpu.edu.cn)
-# 2022 Zhengyang Chen (chenzhengyang117@gmail.com)
+# Copyright 2024 Yiyang Zhao (zhaoyy22@mails.tsinghua.edu.cn)
+# 2024 Hongji Wang (jijijiang77@gmail.com)

. ./path.sh || exit 1

-stage=3
-stop_stage=3
+stage=-1
+stop_stage=-1

data=data
data_type="raw" # shard/raw
model=whisper_PMFA_large_v2

exp_dir=exp/Whisper_PMFA_large_v2_voxceleb1_mel_5s

gpus="[0]"
num_avg=10
gpus="[0,1]"
num_avg=1
checkpoint=

trials="vox1_O_cleaned.kaldi"
@@ -25,6 +23,9 @@ score_norm_method="asnorm" # asnorm/snorm
top_n=300

. tools/parse_options.sh || exit 1
+if ! pip show openai-whisper > /dev/null 2>&1; then
+  pip install openai-whisper==20231117
+fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Preparing datasets ..."
2 changes: 1 addition & 1 deletion examples/voxceleb/v1/Whisper-PMFA/tools
2 changes: 1 addition & 1 deletion examples/voxceleb/v1/Whisper-PMFA/wespeaker
2 changes: 0 additions & 2 deletions wespeaker/bin/train.py
@@ -107,7 +107,6 @@ def train(config='conf/config.yaml', **kwargs):

# model: frontend (optional) => speaker model => projection layer
logger.info("<== Model ==>")
-# frontend: fbank or s3prl
frontend_type = configs['dataset_args'].get('frontend', 'fbank')
if frontend_type != "fbank":
frontend_args = frontend_type + "_args"
@@ -119,7 +118,6 @@ def train(config='conf/config.yaml', **kwargs):
model.add_module("frontend", frontend)
else:
model = get_speaker_model(configs['model'])(**configs['model_args'])
-
if rank == 0:
num_params = sum(param.numel() for param in model.parameters())
logger.info('speaker_model size: {}'.format(num_params))
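The removed comment restricted the frontend to fbank or s3prl, which no longer holds now that a whisper_encoder frontend is supported; the surviving code derives the argument-block name directly from the configured frontend value. A hypothetical configuration fragment matching that convention, written as the dict train.py reads (the inner option names are placeholders, not the recipe's actual keys):

```python
# Hypothetical dataset_args fragment matching the lookup shown in train.py above.
configs = {
    "dataset_args": {
        "frontend": "whisper_encoder",   # anything other than "fbank" takes the frontend branch
        "whisper_encoder_args": {        # block name is the frontend value + "_args"
            # frontend-specific options go here; the keys depend on the recipe's config.
        },
    },
}

frontend_type = configs["dataset_args"].get("frontend", "fbank")
if frontend_type != "fbank":
    frontend_args = frontend_type + "_args"
    print(frontend_args)  # -> whisper_encoder_args
```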