From 1283b7695244c9df95a72815475cf97e79ae0668 Mon Sep 17 00:00:00 2001 From: Hongji Wang Date: Sat, 31 Aug 2024 01:21:03 +0800 Subject: [PATCH 1/2] [recipe] fix errors in voxceleb/v1/Whisper-PMFA --- examples/voxceleb/v1/Whisper-PMFA/README.md | 2 +- .../Whisper-PMFA/conf/whisper_PMFA_stage0.yaml | 6 +++--- .../Whisper-PMFA/conf/whisper_PMFA_stage1.yaml | 6 +++--- .../voxceleb/v1/Whisper-PMFA/local/score.sh | 2 +- .../v1/Whisper-PMFA/local/score_norm.sh | 2 +- examples/voxceleb/v1/Whisper-PMFA/run.sh | 17 +++++++++-------- examples/voxceleb/v1/Whisper-PMFA/tools | 2 +- examples/voxceleb/v1/Whisper-PMFA/wespeaker | 2 +- wespeaker/bin/train.py | 2 -- 9 files changed, 20 insertions(+), 21 deletions(-) diff --git a/examples/voxceleb/v1/Whisper-PMFA/README.md b/examples/voxceleb/v1/Whisper-PMFA/README.md index 88af1615..fabce8ba 100644 --- a/examples/voxceleb/v1/Whisper-PMFA/README.md +++ b/examples/voxceleb/v1/Whisper-PMFA/README.md @@ -20,5 +20,5 @@ | | √ | 6.63M | 1.88 | | Whisper-PMFA | × | 478.7M | 1.62 | | | √ | 478.7M | **1.42** | -| Whisper-PMFA with LoRa (Coming soon) | √ | 10.9M | 1.62 | +| Whisper-PMFA with LoRA (Coming soon) | √ | 10.9M | 1.62 | diff --git a/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage0.yaml b/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage0.yaml index 47b875bb..8a5f360d 100644 --- a/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage0.yaml +++ b/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage0.yaml @@ -1,8 +1,8 @@ ### train configuraton -exp_dir: exp/test +exp_dir: exp/Whisper_PMFA_large_v2_voxceleb1_mel_5s gpus: "[0,1]" -num_avg: 10 +num_avg: 1 enable_amp: False # whether enable automatic mixed precision training seed: 42 @@ -57,7 +57,7 @@ margin_update: initial_margin: 0.2 final_margin: 0.2 increase_start_epoch: 0 - fix_start_epoch: 30 + fix_start_epoch: 4 update_margin: True increase_type: "exp" # exp, linear diff --git a/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage1.yaml 
b/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage1.yaml index aa936979..738525bb 100644 --- a/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage1.yaml +++ b/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage1.yaml @@ -1,8 +1,8 @@ ### train configuraton -exp_dir: exp/test +exp_dir: exp/Whisper_PMFA_large_v2_voxceleb1_mel_5s gpus: "[0,1]" -num_avg: 10 +num_avg: 1 enable_amp: False # whether enable automatic mixed precision training seed: 42 @@ -56,7 +56,7 @@ margin_update: initial_margin: 0.2 final_margin: 0.2 increase_start_epoch: 0 - fix_start_epoch: 30 + fix_start_epoch: 8 update_margin: True increase_type: "exp" # exp, linear diff --git a/examples/voxceleb/v1/Whisper-PMFA/local/score.sh b/examples/voxceleb/v1/Whisper-PMFA/local/score.sh index 5b81a883..b4f89129 100755 --- a/examples/voxceleb/v1/Whisper-PMFA/local/score.sh +++ b/examples/voxceleb/v1/Whisper-PMFA/local/score.sh @@ -44,7 +44,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then scores_dir=${exp_dir}/scores for x in $trials; do python wespeaker/bin/compute_metrics.py \ - --p_target 0.01 \ + --p_target 0.05 \ --c_fa 1 \ --c_miss 1 \ ${scores_dir}/${x}.score \ diff --git a/examples/voxceleb/v1/Whisper-PMFA/local/score_norm.sh b/examples/voxceleb/v1/Whisper-PMFA/local/score_norm.sh index 73431093..48a4b6c5 100755 --- a/examples/voxceleb/v1/Whisper-PMFA/local/score_norm.sh +++ b/examples/voxceleb/v1/Whisper-PMFA/local/score_norm.sh @@ -57,7 +57,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then for x in ${trials}; do scores_dir=${exp_dir}/scores python wespeaker/bin/compute_metrics.py \ - --p_target 0.01 \ + --p_target 0.05 \ --c_fa 1 \ --c_miss 1 \ ${scores_dir}/${output_name}_${x}.score \ diff --git a/examples/voxceleb/v1/Whisper-PMFA/run.sh b/examples/voxceleb/v1/Whisper-PMFA/run.sh index 42b63023..577423f9 100644 --- a/examples/voxceleb/v1/Whisper-PMFA/run.sh +++ b/examples/voxceleb/v1/Whisper-PMFA/run.sh @@ -1,22 +1,20 @@ #!/bin/bash -# Copyright 2022 Hongji Wang 
(jijijiang77@gmail.com) -# 2022 Chengdong Liang (liangchengdong@mail.nwpu.edu.cn) -# 2022 Zhengyang Chen (chenzhengyang117@gmail.com) +# Copyright 2024 Yiyang Zhao (zhaoyy22@mails.tsinghua.edu.cn) +# 2024 Hongji Wang (jijijiang77@gmail.com) . ./path.sh || exit 1 -stage=3 -stop_stage=3 +stage=-1 +stop_stage=-1 data=data data_type="raw" # shard/raw model=whisper_PMFA_large_v2 exp_dir=exp/Whisper_PMFA_large_v2_voxceleb1_mel_5s - -gpus="[0]" -num_avg=10 +gpus="[0,1]" +num_avg=1 checkpoint= trials="vox1_O_cleaned.kaldi" @@ -25,6 +23,9 @@ score_norm_method="asnorm" # asnorm/snorm top_n=300 . tools/parse_options.sh || exit 1 +if ! pip show openai-whisper > /dev/null 2>&1; then + pip install openai-whisper==20231117 +fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "Preparing datasets ..." diff --git a/examples/voxceleb/v1/Whisper-PMFA/tools b/examples/voxceleb/v1/Whisper-PMFA/tools index 8a51cc50..7c5e8a15 120000 --- a/examples/voxceleb/v1/Whisper-PMFA/tools +++ b/examples/voxceleb/v1/Whisper-PMFA/tools @@ -1 +1 @@ -../../../../tools +../../../../tools \ No newline at end of file diff --git a/examples/voxceleb/v1/Whisper-PMFA/wespeaker b/examples/voxceleb/v1/Whisper-PMFA/wespeaker index b7f7ab8b..4ab9d90a 120000 --- a/examples/voxceleb/v1/Whisper-PMFA/wespeaker +++ b/examples/voxceleb/v1/Whisper-PMFA/wespeaker @@ -1 +1 @@ -../../../../wespeaker +../../../../wespeaker \ No newline at end of file diff --git a/wespeaker/bin/train.py b/wespeaker/bin/train.py index d4e440be..63fb99bc 100644 --- a/wespeaker/bin/train.py +++ b/wespeaker/bin/train.py @@ -107,7 +107,6 @@ def train(config='conf/config.yaml', **kwargs): # model: frontend (optional) => speaker model => projection layer logger.info("<== Model ==>") - # frontend: fbank or s3prl frontend_type = configs['dataset_args'].get('frontend', 'fbank') if frontend_type != "fbank": frontend_args = frontend_type + "_args" @@ -119,7 +118,6 @@ def train(config='conf/config.yaml', **kwargs): model.add_module("frontend", 
frontend) else: model = get_speaker_model(configs['model'])(**configs['model_args']) - if rank == 0: num_params = sum(param.numel() for param in model.parameters()) logger.info('speaker_model size: {}'.format(num_params)) From 96eb76bd34dbdaa58609df86857b7694515dd66c Mon Sep 17 00:00:00 2001 From: Hongji Wang Date: Sat, 31 Aug 2024 01:35:17 +0800 Subject: [PATCH 2/2] [docs] update README.md for the new recipe voxceleb/v1/Whisper-PMFA --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 690e93e8..0ff69681 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ pre-commit install # for clean and tidy code ``` ## 🔥 News +* 2024.08.30: We support the whisper_encoder-based frontend and propose the [Whisper-PMFA](https://arxiv.org/pdf/2408.15585) framework, see [#356](https://github.com/wenet-e2e/wespeaker/pull/356). * 2024.08.20: Update diarization recipe for VoxConverse dataset by leveraging umap dimensionality reduction and hdbscan clustering, see [#347](https://github.com/wenet-e2e/wespeaker/pull/347) and [#352](https://github.com/wenet-e2e/wespeaker/pull/352). * 2024.08.18: Support using ssl pre-trained models as the frontend. The [WavLM recipe](https://github.com/wenet-e2e/wespeaker/blob/master/examples/voxceleb/v2/run_wavlm.sh) is also provided, see [#344](https://github.com/wenet-e2e/wespeaker/pull/344). * 2024.05.15: Add support for [quality-aware score calibration](https://arxiv.org/pdf/2211.00815), see [#320](https://github.com/wenet-e2e/wespeaker/pull/320).