From 167066292eceb12b1604cc9d6581b766ad061b09 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Fri, 13 Nov 2020 16:13:10 -0500 Subject: [PATCH] [egs] LibriCSS recipe (#4321) Refer to the README.md to each eg directory for description. --- .../v1/diarization/vb_hmm_xvector.py | 89 +------ .../v1/diarization/vb_hmm_xvector.sh | 23 +- egs/libri_css/README.md | 63 +++++ egs/libri_css/s5_css/cmd.sh | 14 + egs/libri_css/s5_css/conf/mfcc.conf | 2 + egs/libri_css/s5_css/conf/mfcc_hires.conf | 10 + egs/libri_css/s5_css/conf/online_cmvn.conf | 1 + egs/libri_css/s5_css/diarization | 1 + egs/libri_css/s5_css/local | 1 + egs/libri_css/s5_css/path.sh | 9 + egs/libri_css/s5_css/rnnlm | 1 + egs/libri_css/s5_css/run.sh | 243 ++++++++++++++++++ egs/libri_css/s5_css/sid | 1 + egs/libri_css/s5_css/steps | 1 + egs/libri_css/s5_css/utils | 1 + egs/libri_css/s5_mono/cmd.sh | 14 + egs/libri_css/s5_mono/conf/mfcc.conf | 2 + egs/libri_css/s5_mono/conf/mfcc_hires.conf | 10 + egs/libri_css/s5_mono/conf/online_cmvn.conf | 1 + egs/libri_css/s5_mono/diarization | 1 + .../s5_mono/local/best_wer_matching.py | 72 ++++++ .../s5_mono/local/chain/run_chain_common.sh | 80 ++++++ egs/libri_css/s5_mono/local/chain/run_tdnn.sh | 1 + .../s5_mono/local/chain/tuning/run_tdnn_1d.sh | 171 ++++++++++++ .../local/chain/tuning/run_tdnn_1d_ft.sh | 241 +++++++++++++++++ .../convert_rttm_to_utt2spk_and_segments.py | 98 +++++++ egs/libri_css/s5_mono/local/data_prep_css.sh | 94 +++++++ .../s5_mono/local/data_prep_librispeech.sh | 79 ++++++ egs/libri_css/s5_mono/local/data_prep_mono.sh | 89 +++++++ egs/libri_css/s5_mono/local/decode.sh | 227 ++++++++++++++++ .../s5_mono/local/decode_diarized.sh | 78 ++++++ .../s5_mono/local/decode_diarized_css.sh | 84 ++++++ egs/libri_css/s5_mono/local/decode_oracle.sh | 125 +++++++++ .../s5_mono/local/detect_speech_activity.sh | 225 ++++++++++++++++ .../diarization/post_process_css_rttm.py | 121 +++++++++ .../s5_mono/local/diarization/scluster.sh | 100 +++++++ egs/libri_css/s5_mono/local/diarize.sh | 87 +++++++ egs/libri_css/s5_mono/local/diarize_css.sh | 131 ++++++++++ .../s5_mono/local/diarize_spectral.sh | 81 ++++++ .../s5_mono/local/download_and_untar.sh | 100 +++++++ .../s5_mono/local/download_diarizer.sh | 38 +++ egs/libri_css/s5_mono/local/download_lm.sh | 76 ++++++ egs/libri_css/s5_mono/local/dscore.sh | 70 +++++ .../s5_mono/local/extract_vad_weights.sh | 86 +++++++ egs/libri_css/s5_mono/local/format_lms.sh | 60 +++++ .../s5_mono/local/get_perspeaker_output.py | 91 +++++++ egs/libri_css/s5_mono/local/make_voxceleb1.pl | 130 ++++++++++ egs/libri_css/s5_mono/local/make_voxceleb2.pl | 70 +++++ .../s5_mono/local/multispeaker_score.sh | 111 ++++++++ egs/libri_css/s5_mono/local/nnet3/decode.sh | 163 ++++++++++++ .../s5_mono/local/nnet3/run_ivector_common.sh | 149 +++++++++++ .../local/nnet3/xvector/prepare_feats.sh | 89 +++++++ .../nnet3/xvector/prepare_feats_for_egs.sh | 83 ++++++ .../local/nnet3/xvector/run_xvector.sh | 1 + .../nnet3/xvector/tuning/run_xvector_1a.sh | 149 +++++++++++ egs/libri_css/s5_mono/local/prepare_data.py | 104 ++++++++ .../s5_mono/local/prepare_data_css.py | 92 +++++++ egs/libri_css/s5_mono/local/prepare_dict.sh | 143 +++++++++++ egs/libri_css/s5_mono/local/rnnlm/train.sh | 1 + .../local/rnnlm/tuning/run_tdnn_lstm_1a.sh | 130 ++++++++++ .../s5_mono/local/run_cleanup_segmentation.sh | 63 +++++ egs/libri_css/s5_mono/local/score.sh | 1 + .../s5_mono/local/score_reco_diarized.sh | 147 +++++++++++ .../s5_mono/local/score_reco_oracle.sh | 107 ++++++++ 
.../local/segmentation/apply_webrtcvad.py | 212 +++++++++++++++ .../segmentation/detect_speech_activity.sh | 217 ++++++++++++++++ egs/libri_css/s5_mono/local/train_asr.sh | 205 +++++++++++++++ egs/libri_css/s5_mono/local/train_diarizer.sh | 186 ++++++++++++++ egs/libri_css/s5_mono/local/wer_output_filter | 25 ++ egs/libri_css/s5_mono/path.sh | 10 + egs/libri_css/s5_mono/rnnlm | 1 + egs/libri_css/s5_mono/run.sh | 99 +++++++ egs/libri_css/s5_mono/sid | 1 + egs/libri_css/s5_mono/steps | 1 + egs/libri_css/s5_mono/utils | 1 + 75 files changed, 5787 insertions(+), 97 deletions(-) create mode 100644 egs/libri_css/README.md create mode 100644 egs/libri_css/s5_css/cmd.sh create mode 100644 egs/libri_css/s5_css/conf/mfcc.conf create mode 100644 egs/libri_css/s5_css/conf/mfcc_hires.conf create mode 100644 egs/libri_css/s5_css/conf/online_cmvn.conf create mode 120000 egs/libri_css/s5_css/diarization create mode 120000 egs/libri_css/s5_css/local create mode 100644 egs/libri_css/s5_css/path.sh create mode 120000 egs/libri_css/s5_css/rnnlm create mode 100755 egs/libri_css/s5_css/run.sh create mode 120000 egs/libri_css/s5_css/sid create mode 120000 egs/libri_css/s5_css/steps create mode 120000 egs/libri_css/s5_css/utils create mode 100644 egs/libri_css/s5_mono/cmd.sh create mode 100644 egs/libri_css/s5_mono/conf/mfcc.conf create mode 100644 egs/libri_css/s5_mono/conf/mfcc_hires.conf create mode 100644 egs/libri_css/s5_mono/conf/online_cmvn.conf create mode 120000 egs/libri_css/s5_mono/diarization create mode 100755 egs/libri_css/s5_mono/local/best_wer_matching.py create mode 100755 egs/libri_css/s5_mono/local/chain/run_chain_common.sh create mode 120000 egs/libri_css/s5_mono/local/chain/run_tdnn.sh create mode 100755 egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d.sh create mode 100755 egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d_ft.sh create mode 100755 egs/libri_css/s5_mono/local/convert_rttm_to_utt2spk_and_segments.py create mode 100755 egs/libri_css/s5_mono/local/data_prep_css.sh create mode 100755 egs/libri_css/s5_mono/local/data_prep_librispeech.sh create mode 100755 egs/libri_css/s5_mono/local/data_prep_mono.sh create mode 100755 egs/libri_css/s5_mono/local/decode.sh create mode 100755 egs/libri_css/s5_mono/local/decode_diarized.sh create mode 100755 egs/libri_css/s5_mono/local/decode_diarized_css.sh create mode 100755 egs/libri_css/s5_mono/local/decode_oracle.sh create mode 100755 egs/libri_css/s5_mono/local/detect_speech_activity.sh create mode 100755 egs/libri_css/s5_mono/local/diarization/post_process_css_rttm.py create mode 100755 egs/libri_css/s5_mono/local/diarization/scluster.sh create mode 100755 egs/libri_css/s5_mono/local/diarize.sh create mode 100755 egs/libri_css/s5_mono/local/diarize_css.sh create mode 100755 egs/libri_css/s5_mono/local/diarize_spectral.sh create mode 100755 egs/libri_css/s5_mono/local/download_and_untar.sh create mode 100755 egs/libri_css/s5_mono/local/download_diarizer.sh create mode 100755 egs/libri_css/s5_mono/local/download_lm.sh create mode 100644 egs/libri_css/s5_mono/local/dscore.sh create mode 100755 egs/libri_css/s5_mono/local/extract_vad_weights.sh create mode 100755 egs/libri_css/s5_mono/local/format_lms.sh create mode 100755 egs/libri_css/s5_mono/local/get_perspeaker_output.py create mode 100755 egs/libri_css/s5_mono/local/make_voxceleb1.pl create mode 100755 egs/libri_css/s5_mono/local/make_voxceleb2.pl create mode 100755 egs/libri_css/s5_mono/local/multispeaker_score.sh create mode 100755 egs/libri_css/s5_mono/local/nnet3/decode.sh create 
mode 100755 egs/libri_css/s5_mono/local/nnet3/run_ivector_common.sh create mode 100755 egs/libri_css/s5_mono/local/nnet3/xvector/prepare_feats.sh create mode 100755 egs/libri_css/s5_mono/local/nnet3/xvector/prepare_feats_for_egs.sh create mode 120000 egs/libri_css/s5_mono/local/nnet3/xvector/run_xvector.sh create mode 100755 egs/libri_css/s5_mono/local/nnet3/xvector/tuning/run_xvector_1a.sh create mode 100755 egs/libri_css/s5_mono/local/prepare_data.py create mode 100755 egs/libri_css/s5_mono/local/prepare_data_css.py create mode 100755 egs/libri_css/s5_mono/local/prepare_dict.sh create mode 120000 egs/libri_css/s5_mono/local/rnnlm/train.sh create mode 100755 egs/libri_css/s5_mono/local/rnnlm/tuning/run_tdnn_lstm_1a.sh create mode 100755 egs/libri_css/s5_mono/local/run_cleanup_segmentation.sh create mode 120000 egs/libri_css/s5_mono/local/score.sh create mode 100755 egs/libri_css/s5_mono/local/score_reco_diarized.sh create mode 100755 egs/libri_css/s5_mono/local/score_reco_oracle.sh create mode 100755 egs/libri_css/s5_mono/local/segmentation/apply_webrtcvad.py create mode 100755 egs/libri_css/s5_mono/local/segmentation/detect_speech_activity.sh create mode 100755 egs/libri_css/s5_mono/local/train_asr.sh create mode 100755 egs/libri_css/s5_mono/local/train_diarizer.sh create mode 100755 egs/libri_css/s5_mono/local/wer_output_filter create mode 100644 egs/libri_css/s5_mono/path.sh create mode 120000 egs/libri_css/s5_mono/rnnlm create mode 100755 egs/libri_css/s5_mono/run.sh create mode 120000 egs/libri_css/s5_mono/sid create mode 120000 egs/libri_css/s5_mono/steps create mode 120000 egs/libri_css/s5_mono/utils diff --git a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py index 87625a29b25..a284abbeb4a 100644 --- a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py +++ b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2020 Johns Hopkins University (Author: Desh Raj) # Apache 2.0 @@ -9,7 +9,7 @@ # vb_hmm_xvector.sh which can divide all labels into per recording # labels. -import sys, argparse, struct, re +import sys, argparse, struct import numpy as np import itertools import kaldi_io @@ -36,9 +36,6 @@ def get_args(): help="scale sufficient statistics collected using UBM") parser.add_argument("--fb", type=float, default=11, help="speaker regularization coefficient Fb (controls final # of speaker)") - parser.add_argument("--overlap-rttm", type=str, - help="path to an RTTM file containing overlap segments. 
If provided," - "multiple speaker labels will be allocated to these segments.") parser.add_argument("xvector_ark_file", type=str, help="Ark file containing xvectors for all subsegments") parser.add_argument("plda", type=str, @@ -61,59 +58,12 @@ def read_labels_file(label_file): return segments, labels def write_labels_file(seg2label, out_file): - f = open(out_file, 'w') - for seg in sorted(seg2label.keys()): - label = seg2label[seg] - if type(label) is tuple: - f.write("{} {}\n".format(seg, label[0])) - f.write("{} {}\n".format(seg, label[1])) - else: - f.write("{} {}\n".format(seg, label)) - f.close() + with open(out_file, 'w') as f: + for seg in sorted(seg2label.keys()): + label = seg2label[seg] + f.write(f"{seg} {label}\n") return -def get_overlap_decision(overlap_segs, subsegment, frac = 0.5): - """ Returns true if at least 'frac' fraction of the subsegment lies - in the overlap_segs.""" - start_time = subsegment[0] - end_time = subsegment[1] - dur = end_time - start_time - total_ovl = 0 - - for seg in overlap_segs: - cur_start, cur_end = seg - if (cur_start >= end_time): - break - ovl_start = max(start_time, cur_start) - ovl_end = min(end_time, cur_end) - ovl_time = max(0, ovl_end-ovl_start) - - total_ovl += ovl_time - - return (total_ovl >= frac * dur) - - -def get_overlap_vector(overlap_rttm, segments): - reco_id = '_'.join(segments[0].split('_')[:3]) - overlap_segs = [] - with open(overlap_rttm, 'r') as f: - for line in f.readlines(): - parts = line.strip().split() - if (parts[1] == reco_id): - overlap_segs.append((float(parts[3]), float(parts[3]) + float(parts[4]))) - ol_vec = np.zeros(len(segments)) - overlap_segs.sort(key=lambda x: x[0]) - for i, segment in enumerate(segments): - parts = re.split('_|-',segment) - start_time = (float(parts[3]) + float(parts[5]))/100 - end_time = (float(parts[3]) + float(parts[6]))/100 - - is_overlap = get_overlap_decision(overlap_segs, (start_time, end_time)) - if is_overlap: - ol_vec[i] = 1 - print ("{}: {} fraction of segments are overlapping".format(id, ol_vec.sum()/len(ol_vec))) - return ol_vec - def read_args(args): segments, labels = read_labels_file(args.input_label_file) xvec_all = dict(kaldi_io.read_vec_flt_ark(args.xvector_ark_file)) @@ -121,17 +71,12 @@ def read_args(args): for segment in segments: xvectors.append(xvec_all[segment]) _, _, plda_psi = kaldi_io.read_plda(args.plda) - if (args.overlap_rttm is not None): - print('Getting overlap segments...') - overlaps = get_overlap_vector(args.overlap_rttm, segments) - else: - overlaps = None - return xvectors, segments, labels, plda_psi, overlaps + return xvectors, segments, labels, plda_psi ################################################################### -def vb_hmm(segments, in_labels, xvectors, overlaps, plda_psi, init_smoothing, loop_prob, fa, fb): +def vb_hmm(segments, in_labels, xvectors, plda_psi, init_smoothing, loop_prob, fa, fb): x = np.array(xvectors) dim = x.shape[1] @@ -153,25 +98,15 @@ def vb_hmm(segments, in_labels, xvectors, overlaps, plda_psi, init_smoothing, lo gamma=q_init, maxSpeakers=q_init.shape[1], maxIters=40, epsilon=1e-6, loopProb=loop_prob, Fa=fa, Fb=fb) - labels = np.argsort(q, axis=1)[:,[-1,-2]] + labels = np.unique(q.argmax(1), return_inverse=True)[1] - if overlaps is not None: - final_labels = [] - for i in range(len(overlaps)): - if (overlaps[i] == 1): - final_labels.append((labels[i,0], labels[i,1])) - else: - final_labels.append(labels[i,0]) - else: - final_labels = labels[:,0] - - return {seg:label for seg,label in zip(segments,final_labels)} + 
return {seg:label for seg,label in zip(segments,labels)} def main(): args = get_args() - xvectors, segments, labels, plda_psi, overlaps = read_args(args) + xvectors, segments, labels, plda_psi = read_args(args) - seg2label_vb = vb_hmm(segments, labels, xvectors, overlaps, plda_psi, args.init_smoothing, + seg2label_vb = vb_hmm(segments, labels, xvectors, plda_psi, args.init_smoothing, args.loop_prob, args.fa, args.fb) write_labels_file(seg2label_vb, args.output_label_file) diff --git a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh index 5badd747d5d..081219ff2a4 100755 --- a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh +++ b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh @@ -13,7 +13,6 @@ stage=0 nj=10 cleanup=true rttm_channel=0 -overlap_rttm= # Path to an RTTM output of an external overlap detector # The hyperparameters used here are taken from the DIHARD # optimal hyperparameter values reported in: @@ -69,14 +68,6 @@ if [ "$result" == "0" ]; then python3 -m pip install numexpr fi -overlap_rttm_opt= -if ! [ -z "$overlap_rttm" ]; then - overlap_rttm_opt="--overlap-rttm $overlap_rttm" - rttm_bin="make_rttm_ol.py" -else - rttm_bin="make_rttm.py" -fi - if [ $stage -le 0 ]; then # Mean subtraction (If original x-vectors are high-dim, e.g. 512, you should # consider also applying LDA to reduce dimensionality to, say, 200) @@ -85,18 +76,10 @@ if [ $stage -le 0 ]; then fi echo -e "Performing bayesian HMM based x-vector clustering..\n" -# making a shell script for each job -for n in `seq $nj`; do - cat <<-EOF > $dir/tmp/vb_hmm.$n.sh - python3 diarization/vb_hmm_xvector.py $overlap_rttm_opt \ - --loop-prob $loop_prob --fa $fa --fb $fb \ - $xvec_dir/xvector_norm.ark $plda $dir/labels.$n $dir/labels.vb.$n -EOF -done - -chmod a+x $dir/tmp/vb_hmm.*.sh $cmd JOB=1:$nj $dir/log/vb_hmm.JOB.log \ - $dir/tmp/vb_hmm.JOB.sh + diarization/vb_hmm_xvector.py \ + --loop-prob $loop_prob --fa $fa --fb $fb \ + $xvec_dir/xvector_norm.ark $plda $dir/labels.JOB $dir/labels.vb.JOB if [ $stage -le 1 ]; then echo "$0: combining labels" diff --git a/egs/libri_css/README.md b/egs/libri_css/README.md new file mode 100644 index 00000000000..b0901d5865e --- /dev/null +++ b/egs/libri_css/README.md @@ -0,0 +1,63 @@ +### LibriCSS integrated recipe + +This is a Kaldi recipe for the LibriCSS data, providing diarization and +ASR on mixed single-channel and separated audio inputs. + +#### Data +We use the LibriCSS data released with the following paper: +``` +@article{Chen2020ContinuousSS, + title={Continuous Speech Separation: Dataset and Analysis}, + author={Z. Chen and T. Yoshioka and Liang Lu and T. Zhou and Zhong Meng and Yi Luo and J. Wu and J. Li}, + journal={ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, + year={2020} +} +``` +For the official data and code, check out [the official repo](https://github.com/chenzhuo1011/libri_css). + +#### Recipe details +This recipe addresses the problem of speech recognition in a meeting-like +scenario, where multiple overlapping speakers may be present, and the +number of speakers is not known beforehand. + +We provide recipes for 2 scenarios: +1. `s5_mono`: This is a single channel diarization + ASR recipe which takes as the +input a long single-channel recording containing mixed audio. 
It then performs SAD,
+diarization, and ASR on it, and outputs speaker-attributed transcriptions,
+which are then evaluated with cpWER (similar to CHiME-6 Track 2).
+2. `s5_css`: This pipeline uses a speech separation module at the beginning,
+so the input is 2-3 separated audio streams. We assume that the separation is
+window-based, so that the same speaker may be split across different streams in
+different windows, thus making diarization necessary.
+
+#### Pretrained models for diarization and ASR
+For ease of reproduction, we include the training of both modules in the
+recipe. We also provide pretrained models for both the diarization and ASR
+systems.
+
+* SAD: CHiME-6 baseline TDNN-Stats SAD, available [here](http://kaldi-asr.org/models/m12).
+* Speaker diarization: CHiME-6 baseline x-vector + AHC diarizer, trained on VoxCeleb
+with simulated RIRs, available [here](http://kaldi-asr.org/models/m12).
+* ASR: We used the chain model trained on 960h of clean LibriSpeech training data, available
+[here](http://kaldi-asr.org/models/m13). It was then additionally fine-tuned for 1
+epoch on LibriSpeech + simulated RIRs. For rescoring, we trained a TDNN-LSTM
+language model. All of these models are available at this
+[Google Drive link](https://drive.google.com/file/d/13ceXdK6oAUuUyxn7kjQVVqpe8r6Sc7ds/view?usp=sharing).
+
+#### Speech separation
+The speech separation module is not provided. If you want to use the
+`s5_css` recipe, check out [this tutorial](https://desh2608.github.io/pages/jsalt/) for
+instructions on how to plug your component into the pipeline.
+
+If you find this recipe useful for your experiments, consider citing:
+
+```
+@article{Raj2021Integration,
+  title={Integration of speech separation, diarization, and recognition for multi-speaker meetings:
+System description, Comparison, and Analysis},
+  author={D. Raj and P. Denisov and Z. Chen and H. Erdogan and Z. Huang and M. He and S. Watanabe and
+  J. Du and T. Yoshioka and Y. Luo and N. Kanda and J. Li and S. Wisdom and J. Hershey},
+  journal={IEEE Spoken Language Technology Workshop 2021},
+  year={2021}
+}
+``` \ No newline at end of file
diff --git a/egs/libri_css/s5_css/cmd.sh b/egs/libri_css/s5_css/cmd.sh
new file mode 100644
index 00000000000..86514d94d4d
--- /dev/null
+++ b/egs/libri_css/s5_css/cmd.sh
@@ -0,0 +1,14 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
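+#
+# As an illustration only (these are not the checked-in defaults): on a
+# single machine with no grid engine, you could instead set, e.g.,
+#   export train_cmd="run.pl"
+#   export decode_cmd="run.pl"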
+ +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/libri_css/s5_css/conf/mfcc.conf b/egs/libri_css/s5_css/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/libri_css/s5_css/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/libri_css/s5_css/conf/mfcc_hires.conf b/egs/libri_css/s5_css/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/libri_css/s5_css/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/libri_css/s5_css/conf/online_cmvn.conf b/egs/libri_css/s5_css/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/libri_css/s5_css/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/libri_css/s5_css/diarization b/egs/libri_css/s5_css/diarization new file mode 120000 index 00000000000..bad937c1444 --- /dev/null +++ b/egs/libri_css/s5_css/diarization @@ -0,0 +1 @@ +../../callhome_diarization/v1/diarization \ No newline at end of file diff --git a/egs/libri_css/s5_css/local b/egs/libri_css/s5_css/local new file mode 120000 index 00000000000..2757f389a5b --- /dev/null +++ b/egs/libri_css/s5_css/local @@ -0,0 +1 @@ +../s5_mono/local \ No newline at end of file diff --git a/egs/libri_css/s5_css/path.sh b/egs/libri_css/s5_css/path.sh new file mode 100644 index 00000000000..2f4e4e4fb21 --- /dev/null +++ b/egs/libri_css/s5_css/path.sh @@ -0,0 +1,9 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH +export PATH=$PWD/dscore:$PATH +export PYTHONPATH="${PYTHONPATH}:$PWD/dscore" +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/libri_css/s5_css/rnnlm b/egs/libri_css/s5_css/rnnlm new file mode 120000 index 00000000000..72302c5e570 --- /dev/null +++ b/egs/libri_css/s5_css/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm \ No newline at end of file diff --git a/egs/libri_css/s5_css/run.sh b/egs/libri_css/s5_css/run.sh new file mode 100755 index 00000000000..6982983a448 --- /dev/null +++ b/egs/libri_css/s5_css/run.sh @@ -0,0 +1,243 @@ +#!/usr/bin/env bash +# +# LibriCSS pipeline containing speech separation. We don't provide +# any training stages for diarization or ASR here, since they are +# the same as those in s5_mono. As such, this run script is +# actually a decoding script. 
Before running this script, you +# need to have run your separation module (or use the separated +# audio streams we have provided), and the output streams must +# be named in the following naming convention: +# overlap_ratio_10.0_sil0.1_1.0_session7_actual10.1_channel_1.wav +# Here, "channel" denotes the stream number, for example, if your +# method separates the audio into 3 streams, they should be named +# channel_0, channel_1, and channel_2. The wav files can be organized +# in any hierarchy within the directory, but this naming +# convention must be followed. +# +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# Apache 2.0 + +# Begin configuration section. +nj=50 +decode_nj=20 +stage=0 + +nnet3_affix=_cleaned +affix=1d_ft +data_affix= # This can be used to distinguish between different data sources +sad_type=tdnn # Set this to webrtc or tdnn + +# Different stages +sad_stage=0 +diarizer_stage=1 +decode_diarize_stage=0 +score_stage=0 +rnnlm_rescore=true + +# RNNLM rescore options +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true +rnnlm_dir=exp/rnnlm_lstm_1a + +set -e # exit on error + +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + +test_sets="dev${data_affix} eval${data_affix}" + +# Get dev and eval set names from the test_sets +dev_set=$( echo $test_sets | cut -d " " -f1 ) +eval_set=$( echo $test_sets | cut -d " " -f2 ) + +# please change the path accordingly. We need the original LibriCSS +# corpus to get the oracle segments (for evaluation purpose), and +# also the path to the separated wav files +libricss_corpus=/export/fs01/LibriCSS/ + +# Separated wav files +wav_files_dir=/export/c03/zhuc/css/connected_continuous_separation + +########################################################################## +# We first prepare the CSS data in the Kaldi data format. We use session 0 +# for dev and others for eval. Since separation has been performed before- +# hand, each recording will contain multiple streams. We do not make any +# assumptions on the number of streams, so that this recipe is extensible +# to other speech separation methods. However, the following script may +# need to be modified depending on the naming conventions used for the +# wav files. +########################################################################## +if [ $stage -le 0 ]; then + local/data_prep_css.sh --data-affix "$data_affix" \ + $libricss_corpus $wav_files_dir +fi + +####################################################################### +# Perform SAD on the dev/eval data using py-webrtcvad package +####################################################################### + +if [ $stage -le 1 ]; then + for datadir in ${test_sets}; do + test_set=data/${datadir} + if [ $sad_type == "webrtc" ]; then + echo "Applying WebRTC-VAD on ${datadir}" + local/segmentation/apply_webrtcvad.py --mode 2 $test_set | sort > $test_set/segments + else + echo "Applying TDNN-Stats-SAD on ${datadir}" + if [ ! -f ${test_set}/wav.scp ]; then + echo "$0: Not performing SAD on ${test_set}" + exit 0 + fi + + sad_nj=$(wc -l < "$test_set/wav.scp") + nj=$(echo $((decode_nj>sad_nj ? sad_nj : decode_nj))) + # Perform segmentation. 
We use the pretrained SAD available at: + # http://kaldi-asr.org/models/4/0004_tdnn_stats_asr_sad_1a.tar.gz + # Download and extract using tar -xvzf + if [ ! -d exp/segmentation_1a/tdnn_stats_asr_sad_1a ]; then + wget http://kaldi-asr.org/models/4/0004_tdnn_stats_asr_sad_1a.tar.gz + tar -xvzf 0004_tdnn_stats_asr_sad_1a.tar.gz + fi + local/detect_speech_activity.sh --cmd "$decode_cmd" --nj $nj $test_set \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a + + # The pretrained SAD used a different MFCC config. We need to + # copy back our old config files. + cp -r ../s5_mono/conf . + fi + + # Create dummy utt2spk file from obtained segments + awk '{print $1, $2}' ${test_set}/segments > ${test_set}/utt2spk + utils/utt2spk_to_spk2utt.pl ${test_set}/utt2spk > ${test_set}/spk2utt + + # Generate RTTM file from segmentation performed by SAD. This can + # be used to evaluate the performance of the SAD as an intermediate + # step. Note that we remove the "stream" from the segments file reco_id + # here because our ground truth does not have these. This means that + # we will have overlapping segments, but that is allowed in the evaluation. + awk '{$2=$2;sub(/_[0-9]*$/, "", $2); print}' ${test_set}/segments > ${test_set}/segments.score + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ + ${test_set}/utt2spk ${test_set}/segments.score ${test_set}/rttm + rm $test_set/segments.score + + echo "Scoring $datadir.." + # We first generate the reference RTTM from the backed up utt2spk and segments + # files. + ref_rttm=${test_set}/ref_rttm + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py ${test_set}/utt2spk.bak \ + ${test_set}/segments.bak ${test_set}/ref_rttm + + md-eval.pl -r $ref_rttm -s ${test_set}/rttm |\ + awk 'or(/MISSED SPEECH/,/FALARM SPEECH/)' + + done +fi + +####################################################################### +# Feature extraction for the dev and eval data +####################################################################### +if [ $stage -le 2 ]; then + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + mfccdir=mfcc + for x in ${test_sets}; do + utils/fix_data_dir.sh data/$x + nj=$(wc -l < "data/$x/wav.scp") + steps/make_mfcc.sh --nj $decode_nj --cmd "$train_cmd" \ + --mfcc-config conf/mfcc_hires.conf \ + data/$x exp/make_mfcc/$x $mfccdir + done +fi + +####################################################################### +# Perform diarization on the dev/eval data +####################################################################### +if [ $stage -le 3 ]; then + for datadir in ${test_sets}; do + ref_rttm=data/${datadir}/ref_rttm + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py data/${datadir}/utt2spk.bak \ + data/${datadir}/segments.bak $ref_rttm + diar_nj=$(wc -l < "data/$datadir/wav.scp") + + [ ! -d exp/xvector_nnet_1a ] && ./local/download_diarizer.sh + + nj=$(echo $((decode_nj>diar_nj ? diar_nj : decode_nj))) + local/diarize_css.sh --nj $nj --cmd "$train_cmd" --stage $diarizer_stage \ + --ref-rttm $ref_rttm --post-process-rttm true \ + exp/xvector_nnet_1a \ + data/${datadir} \ + exp/${datadir}_diarization + done +fi + +####################################################################### +# Decode diarized output using trained chain model +####################################################################### +if [ $stage -le 4 ]; then + for datadir in ${test_sets}; do + asr_nj=$(wc -l < "data/$datadir/wav.scp") + nj=$(echo $((decode_nj>asr_nj ? 
asr_nj : decode_nj)))
+    local/decode_diarized_css.sh --nj $nj --cmd "$decode_cmd" --stage $decode_diarize_stage \
+      --lm-suffix "_tgsmall" --acwt 1.0 --post-decode-acwt 10.0 \
+      exp/${datadir}_diarization/rttm.post data/$datadir data/lang_test_tgsmall \
+      exp/chain${nnet3_affix}/tdnn_${affix} exp/nnet3${nnet3_affix} \
+      data/${datadir}_diarized || exit 1
+  done
+fi
+
+#######################################################################
+# Score decoded dev/eval sets (only if we are not rescoring)
+#######################################################################
+if [ $stage -le 5 ] && ! $rnnlm_rescore; then
+  # please specify both dev and eval set directories so that the search parameters
+  # (insertion penalty and language model weight) will be tuned using the dev set
+  local/score_reco_diarized.sh --cmd "$train_cmd" --stage $score_stage \
+    --multistream true \
+    --dev_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${dev_set}_diarized_2stage \
+    --dev_datadir ${dev_set}_diarized_hires \
+    --eval_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${eval_set}_diarized_2stage \
+    --eval_datadir ${eval_set}_diarized_hires
+fi
+
+############################################################################
+# RNNLM rescoring
+############################################################################
+if $rnnlm_rescore; then
+  if [ $stage -le 6 ]; then
+    echo "$0: Perform RNNLM lattice-rescoring"
+    pruned=
+    ac_model_dir=exp/chain${nnet3_affix}/tdnn_${affix}
+    if $pruned_rescore; then
+      pruned=_pruned
+    fi
+    for decode_set in $test_sets; do
+      decode_dir=${ac_model_dir}/decode_${decode_set}_diarized_2stage
+      # Lattice rescoring
+      rnnlm/lmrescore$pruned.sh \
+        --cmd "$decode_cmd --mem 8G" \
+        --weight 0.45 --max-ngram-order $ngram_order \
+        data/lang_test_tgsmall $rnnlm_dir \
+        data/${decode_set}_diarized_hires ${decode_dir} \
+        ${ac_model_dir}/decode_${decode_set}_diarized_2stage_rescore
+    done
+  fi
+
+  if [ $stage -le 7 ]; then
+    echo "$0: WERs after rescoring with $rnnlm_dir"
+    local/score_reco_diarized.sh --cmd "$train_cmd" --stage $score_stage \
+      --multistream true \
+      --dev_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${dev_set}_diarized_2stage_rescore \
+      --dev_datadir ${dev_set}_diarized_hires \
+      --eval_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${eval_set}_diarized_2stage_rescore \
+      --eval_datadir ${eval_set}_diarized_hires
+  fi
+fi
+
+exit 0;
+
diff --git a/egs/libri_css/s5_css/sid b/egs/libri_css/s5_css/sid
new file mode 120000
index 00000000000..893a12f30c9
--- /dev/null
+++ b/egs/libri_css/s5_css/sid
@@ -0,0 +1 @@
+../../sre08/v1/sid
\ No newline at end of file
diff --git a/egs/libri_css/s5_css/steps b/egs/libri_css/s5_css/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/libri_css/s5_css/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file
diff --git a/egs/libri_css/s5_css/utils b/egs/libri_css/s5_css/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/libri_css/s5_css/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file
diff --git a/egs/libri_css/s5_mono/cmd.sh b/egs/libri_css/s5_mono/cmd.sh
new file mode 100644
index 00000000000..811adcde474
--- /dev/null
+++ b/egs/libri_css/s5_mono/cmd.sh
@@ -0,0 +1,14 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/libri_css/s5_mono/conf/mfcc.conf b/egs/libri_css/s5_mono/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/libri_css/s5_mono/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/libri_css/s5_mono/conf/mfcc_hires.conf b/egs/libri_css/s5_mono/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/libri_css/s5_mono/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/libri_css/s5_mono/conf/online_cmvn.conf b/egs/libri_css/s5_mono/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/libri_css/s5_mono/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/libri_css/s5_mono/diarization b/egs/libri_css/s5_mono/diarization new file mode 120000 index 00000000000..bad937c1444 --- /dev/null +++ b/egs/libri_css/s5_mono/diarization @@ -0,0 +1 @@ +../../callhome_diarization/v1/diarization \ No newline at end of file diff --git a/egs/libri_css/s5_mono/local/best_wer_matching.py b/egs/libri_css/s5_mono/local/best_wer_matching.py new file mode 100755 index 00000000000..1eda3025652 --- /dev/null +++ b/egs/libri_css/s5_mono/local/best_wer_matching.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# Copyright 2020 Desh Raj +# Apache 2.0. + +import sys, io +import itertools +import numpy as np +from scipy.optimize import linear_sum_assignment +import math + +# This class stores all information about a ref/hyp matching +class WerObject: + # By default, we set the errors to very high values to + # handle the error case. 
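+    # Each input line is expected to carry a Kaldi '%WER' summary for one
+    # ref/hyp pair, e.g. (hypothetical values):
+    #   r1h2 %WER 11.21 [ 123 / 1097, 23 ins, 40 del, 60 sub ]
+    # where the id 'r1h2' encodes ref_id=1 and hyp_id=2; __init__ below picks
+    # the WER, word count, and error counts out of this tokenization.
+    # (Missing/NaN WERs are mapped to a large cost of 1000 when the cost
+    # matrix is built further down.)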
+    id = ''
+    ref_id = ''
+    hyp_id = ''
+    wer = 0
+    num_ins = 0
+    num_del = 0
+    num_sub = 0
+    wc = 0
+
+    def __init__(self, line):
+        self.id, details = line.strip().split(maxsplit=1)
+        tokens = details.split()
+        self.wer = float(tokens[1])
+        self.wc = int(tokens[5][:-1])
+        self.num_ins = int(tokens[6])
+        self.num_del = int(tokens[8])
+        self.num_sub = int(tokens[10])
+        self.ref_id, self.hyp_id = self.id[1:].split('h')
+
+
+infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
+
+# First we read all lines and create a list of WER objects
+wer_objects = []
+for line in infile:
+    if not line or line.isspace():
+        continue
+    wer_object = WerObject(line)
+    wer_objects.append(wer_object)
+
+# Now we create a matrix of costs (WER) which we will use to solve
+# a linear sum assignment problem
+wer_objects.sort(key=lambda x: x.ref_id)
+wer_object_matrix = [list(g) for ref_id, g in itertools.groupby(wer_objects, lambda x: x.ref_id)]
+if len(wer_object_matrix) > len(wer_object_matrix[0]):
+    # More references than hypotheses; take transpose
+    wer_object_matrix = [*zip(*wer_object_matrix)]
+wer_matrix = np.array([[1000 if math.isnan(obj.wer) else obj.wer
+                        for obj in row]
+                       for row in wer_object_matrix])
+
+# Solve the assignment problem and compute WER statistics
+row_ind, col_ind = linear_sum_assignment(wer_matrix)
+total_ins = 0
+total_del = 0
+total_sub = 0
+total_wc = 0
+for row, col in zip(row_ind, col_ind):
+    total_ins += wer_object_matrix[row][col].num_ins
+    total_del += wer_object_matrix[row][col].num_del
+    total_sub += wer_object_matrix[row][col].num_sub
+    total_wc += wer_object_matrix[row][col].wc
+total_error = total_ins + total_del + total_sub
+wer = float(100 * total_error) / total_wc
+
+# Write the final statistics to stdout
+print("%WER {:.2f} [ {} / {}, {} ins, {} del, {} sub ]".format(wer, total_error, total_wc,
+                                                               total_ins, total_del, total_sub))
diff --git a/egs/libri_css/s5_mono/local/chain/run_chain_common.sh b/egs/libri_css/s5_mono/local/chain/run_chain_common.sh
new file mode 100755
index 00000000000..fddda061e19
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/chain/run_chain_common.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+
+# this script has common stages shared across librispeech chain recipes.
+# It generates a new topology in a new lang directory, gets the alignments as
+# lattices, and builds a tree for the new topology
+set -e
+
+stage=11
+
+# input directory names. These options are actually compulsory, and they have
+# been named for convenience
+gmm_dir=
+ali_dir=
+lores_train_data_dir=
+
+num_leaves=6000
+
+# output directory names. They are also compulsory.
+lang=
+lat_dir=
+tree_dir=
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1;
+[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1;
+[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1;
+
+for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 11 ]; then
+  echo "$0: creating lang directory with one state per phone."
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states..
the first one is only repeated + # once, the second one has zero or more repeats.] + if [ -d $lang ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then + echo "$0: $lang already exists, not overwriting it; continuing" + else + echo "$0: $lang already exists and seems to be older than data/lang..." + echo " ... not sure what to do. Exiting." + exit 1; + fi + else + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo + fi +fi + +if [ $stage -le 12 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + nj=$(cat ${ali_dir}/num_jobs) + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + $lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir +fi diff --git a/egs/libri_css/s5_mono/local/chain/run_tdnn.sh b/egs/libri_css/s5_mono/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..e1adaa9346d --- /dev/null +++ b/egs/libri_css/s5_mono/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1d.sh \ No newline at end of file diff --git a/egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d.sh b/egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d.sh new file mode 100755 index 00000000000..8e0a9f415a0 --- /dev/null +++ b/egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d.sh @@ -0,0 +1,171 @@ +#!/usr/bin/env bash +set -e + +# This training script is taken directly from Librispeech tdnn_1d. We +# remove the decode stages since we don't need them for this recipe. + +# configs for 'chain' +stage=0 +decode_nj=50 +train_set=train_960_cleaned +gmm=tri6b_cleaned +nnet3_affix=_cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1d +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# TDNN options +frames_per_eg=150,110,100 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf16 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf17 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+      /export/b{09,10,11,12}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir $train_ivector_dir \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.0 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--frames-overlap-per-eg 0 --constrained false" \
+    --egs.chunk-width $frames_per_eg \
+    --trainer.dropout-schedule $dropout_schedule \
+    --trainer.add-option="--optimization.memory-compression-level=2" \
+    --trainer.num-chunk-per-minibatch 64 \
+    --trainer.frames-per-iter 2500000 \
+    --trainer.num-epochs 4 \
+    --trainer.optimization.num-jobs-initial 3 \
+    --trainer.optimization.num-jobs-final 16 \
+    --trainer.optimization.initial-effective-lrate 0.00015 \
+    --trainer.optimization.final-effective-lrate 0.000015 \
+    --trainer.max-param-change 2.0 \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir $train_data_dir \
+    --tree-dir $tree_dir \
+    --lat-dir $lat_dir \
+    --dir $dir || exit 1;
+
+fi
+
+exit 0;
diff --git a/egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d_ft.sh b/egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d_ft.sh
new file mode 100755
index 00000000000..d965194d98a
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d_ft.sh
@@ -0,0 +1,241 @@
+#!/usr/bin/env bash
+set -e
+
+# This script fine-tunes a pretrained model on additional data
+# which is reverberant, like LibriCSS. We only fine-tune for
+# 1 epoch.
+
+# configs for 'chain'
+stage=0
+nj=40
+decode_nj=50
+train_set=train_960_cleaned
+gmm=tri6b_cleaned
+nnet3_affix=_cleaned
+
+# Pretrained models for AM and i-vector extractor
+src_model_dir=../s5_css/exp/chain$nnet3_affix/tdnn_1d2_sp
+ivector_extractor=exp/nnet3$nnet3_affix/extractor
+primary_lr_factor=0.1  # The learning-rate factor for transferred layers from the source
+                       # model, e.g. if 0, the parameters transferred from the source model
+                       # are fixed.
+                       # The learning-rate factor for newly added layers is 1.0.
+
+# The rest are configs specific to this script. Most of the parameters
+# are just hardcoded at this level, in the commands below.
+affix=1d2_ft
+tree_affix=reverb
+train_stage=-10
+get_egs_stage=-10
+decode_iter=
+
+# TDNN options
+frames_per_eg=150,110,100
+remove_egs=true
+common_egs_dir=
+xent_regularize=0.1
+dropout_schedule='0,0@0.20,0.5@0.50,0'
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + tdnnf-layer name=tdnnf18 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 input=tdnnf17.batchnorm + ## adding the layers for chain branch + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --existing-model $src_model_dir/final.mdl \ + --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs/ + + # Set the learning-rate-factor to be primary_lr_factor for transferred layers " + # and adding new layers to them. + $train_cmd $dir/log/generate_input_mdl.log \ + nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" $src_model_dir/final.mdl - \| \ + nnet3-init --srand=1 - $dir/configs/final.config $dir/input.raw || exit 1; +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b{09,10,11,12}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --trainer.input-model $dir/input.raw \ + --feat.cmvn-opts "--norm-means=true --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 2500000 \ + --trainer.num-epochs 1 \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --use-gpu=wait \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; + +fi + +exit 0; diff --git a/egs/libri_css/s5_mono/local/convert_rttm_to_utt2spk_and_segments.py b/egs/libri_css/s5_mono/local/convert_rttm_to_utt2spk_and_segments.py new file mode 100755 index 00000000000..247aba67b46 --- /dev/null +++ b/egs/libri_css/s5_mono/local/convert_rttm_to_utt2spk_and_segments.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +# Copyright 2019 Vimal Manohar +# Apache 2.0. 
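+
+# Rough worked example of the conversion (all values hypothetical):
+# given the RTTM line
+#   SPEAKER rec1 1 10.50 2.00 <NA> <NA> spk7 <NA> <NA>
+# and a reco2file_and_channel entry "rec1 rec1 1", this script writes
+#   utt2spk:  spk7_001050_001250 spk7
+#   segments: spk7_001050_001250 rec1   10.50   12.50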
+ +"""This script converts an RTTM with +speaker info into kaldi utt2spk and segments""" + +import argparse + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script converts an RTTM with + speaker info into kaldi utt2spk and segments""") + parser.add_argument("--use-reco-id-as-spkr", type=str, + choices=["true", "false"], default="false", + help="Use the recording ID based on RTTM and " + "reco2file_and_channel as the speaker") + parser.add_argument("--append-reco-id-to-spkr", type=str, + choices=["true", "false"], default="false", + help="Append recording ID to the speaker ID") + + parser.add_argument("rttm_file", type=str, + help="""Input RTTM file. + The format of the RTTM file is + """ + """ """) + parser.add_argument("reco2file_and_channel", type=str, + help="""Input reco2file_and_channel. + The format is .""") + parser.add_argument("utt2spk", type=str, + help="Output utt2spk file") + parser.add_argument("segments", type=str, + help="Output segments file") + + args = parser.parse_args() + + args.use_reco_id_as_spkr = bool(args.use_reco_id_as_spkr == "true") + args.append_reco_id_to_spkr = bool(args.append_reco_id_to_spkr == "true") + + if args.use_reco_id_as_spkr: + if args.append_reco_id_to_spkr: + raise Exception("Appending recording ID to speaker does not make sense when using --use-reco-id-as-spkr=true") + + return args + +def main(): + args = get_args() + + file_and_channel2reco = {} + utt2spk={} + segments={} + for line in open(args.reco2file_and_channel): + parts = line.strip().split() + file_and_channel2reco[(parts[1], parts[2])] = parts[0] + + utt2spk_writer = open(args.utt2spk, 'w') + segments_writer = open(args.segments, 'w') + for line in open(args.rttm_file): + parts = line.strip().split() + if parts[0] != "SPEAKER": + continue + + file_id = parts[1] + channel = parts[2] + + try: + reco = file_and_channel2reco[(file_id, channel)] + except KeyError as e: + raise Exception("Could not find recording with " + "(file_id, channel) " + "= ({0},{1}) in {2}: {3}\n".format( + file_id, channel, + args.reco2file_and_channel, str(e))) + + start_time = float(parts[3]) + end_time = start_time + float(parts[4]) + + if args.use_reco_id_as_spkr: + spkr = reco + else: + if args.append_reco_id_to_spkr: + spkr = parts[7] + "_" + reco + else: + spkr = parts[7] + + st = int(start_time * 100) + end = int(end_time * 100) + utt = "{0}_{1:06d}_{2:06d}".format(spkr, st, end) + utt2spk[utt]=spkr + segments[utt]=(reco, start_time, end_time) + + for uttid_id in sorted(utt2spk): + utt2spk_writer.write("{0} {1}\n".format(uttid_id, utt2spk[uttid_id])) + segments_writer.write("{0} {1} {2:7.2f} {3:7.2f}\n".format( + uttid_id, segments[uttid_id][0], segments[uttid_id][1], segments[uttid_id][2])) + +if __name__ == '__main__': + main() diff --git a/egs/libri_css/s5_mono/local/data_prep_css.sh b/egs/libri_css/s5_mono/local/data_prep_css.sh new file mode 100755 index 00000000000..5029b05e9af --- /dev/null +++ b/egs/libri_css/s5_mono/local/data_prep_css.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +# +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# Apache 2.0 + +# Begin configuration section. +# End configuration section +data_affix= +volume=1 + +. ./utils/parse_options.sh # accept options + +. 
./path.sh + +echo >&2 "$0" "$@" +if [ $# -ne 2 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /export/corpora/LibriCSS /export/c01/zhuc/css_data" + exit 1 +fi + +corpus_dir=$1 +wav_files_dir=$2 + +set -e -o pipefail + +# If data is not already present, then download and unzip +if [ ! -d $corpus_dir/for_release ]; then + echo "Downloading and unpacking LibriCSS data." + CWD=`pwd` + mkdir -p $corpus_dir + + cd $corpus_dir + + # Download the data. If the data has already been downloaded, it + # does nothing. (See wget -c) + wget -c --load-cookies /tmp/cookies.txt \ + "https://docs.google.com/uc?export=download&confirm=$(wget --quiet \ + --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate \ + 'https://docs.google.com/uc?export=download&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l' \ + -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l" \ + -O for_release.zip && rm -rf /tmp/cookies.txt + + # unzip (skip if already extracted) + unzip -n for_release.zip + + # segmentation + cd for_release + python3 segment_libricss.py -data_path . + + cd $CWD +fi + +# Process the downloaded data directory to get data in Kaldi format +# We first copy all the separated wav files from the original location +# without any directory structure. Here, the wav naming convention is +# similar to that in the LibriCSS corpus meeting directories, with an +# additional `channel_n` at the end denoting the stream number, e.g. +# overlap_ratio_10.0_sil0.1_1.0_session0_actual10.1_channel_0.wav +# note that this "channel" is actually one of the separated audio streams. +mkdir -p data/local/data${data_affix}/wavs_orig +find $wav_files_dir -name '*.wav' -exec cp {} data/local/data${data_affix}/wavs_orig \; +local/prepare_data_css.py --srcpath $corpus_dir/for_release --wav-path data/local/data${data_affix}/wavs_orig \ + --tgtpath data/local/data${data_affix} --volume $volume + +# Create dev and eval splits based on sessions. In total we have 10 sessions (session0 to +# session9) of approximately 1 hour each. +dev_sessions="session0" +eval_sessions="session[1-9]" + +mkdir -p data/dev${data_affix} +for file in wav.scp utt2spk text segments; do + grep $dev_sessions data/local/data${data_affix}/$file | sort > data/dev${data_affix}/$file +done + +mkdir -p data/eval${data_affix} +for file in wav.scp utt2spk text segments; do + grep $eval_sessions data/local/data${data_affix}/$file | sort > data/eval${data_affix}/$file +done + +# Move the utt2spk, segments, and text file to .bak so that they are only used +# in the last scoring stage. We also prepare a dummy utt2spk and spk2utt for +# these. 
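+#
+# For example (hypothetical recording id), a wav.scp entry
+#   overlap_ratio_10.0_sil0.1_1.0_session0_actual10.1_channel_0 /path/to/that.wav
+# yields the dummy utt2spk line
+#   overlap_ratio_10.0_sil0.1_1.0_session0_actual10.1_channel_0 overlap_ratio_10.0_sil0.1_1.0_session0_actual10.1_channel_0
+# i.e. each recording acts as its own "speaker" until diarization assigns real ones.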
+for datadir in dev eval; do + for file in text utt2spk segments; do + mv data/$datadir${data_affix}/$file data/$datadir${data_affix}/$file.bak + done + + awk '{print $1, $1}' data/$datadir${data_affix}/wav.scp > data/$datadir${data_affix}/utt2spk + utils/utt2spk_to_spk2utt.pl data/$datadir${data_affix}/utt2spk > data/$datadir${data_affix}/spk2utt + +done diff --git a/egs/libri_css/s5_mono/local/data_prep_librispeech.sh b/egs/libri_css/s5_mono/local/data_prep_librispeech.sh new file mode 100755 index 00000000000..cbdd147b2df --- /dev/null +++ b/egs/libri_css/s5_mono/local/data_prep_librispeech.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash + +# Copyright 2014 Vassil Panayotov +# 2014 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" + exit 1 +fi + +src=$1 +dst=$2 + +spk_file=$src/../SPEAKERS.TXT + +mkdir -p $dst || exit 1; + +[ ! -d $src ] && echo "$0: no such directory $src" && exit 1; +[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1; + + +wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp +trans=$dst/text; [[ -f "$trans" ]] && rm $trans +utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk +spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender + +for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do + reader=$(basename $reader_dir) + if ! [ $reader -eq $reader ]; then # not integer. + echo "$0: unexpected subdirectory name $reader" + exit 1; + fi + + reader_gender=$(egrep "^$reader[ ]+\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}') + if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then + echo "Unexpected gender: '$reader_gender'" + exit 1; + fi + + for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do + chapter=$(basename $chapter_dir) + if ! [ "$chapter" -eq "$chapter" ]; then + echo "$0: unexpected chapter-subdirectory name $chapter" + exit 1; + fi + + find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ + awk -v "dir=$chapter_dir" '{printf "%s sox %s/%s.flac -t wav - |\n", $0, dir, $0}' >>$wav_scp|| exit 1 + + chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt + [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 + cat $chapter_trans >>$trans + + # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered + # to be a different speaker. This is done for simplicity and because we want + # e.g. the CMVN to be calculated per-chapter + awk -v "reader=$reader" -v "chapter=$chapter" '{printf "%s %s-%s\n", $1, reader, chapter}' \ + <$chapter_trans >>$utt2spk || exit 1 + + # reader -> gender map (again using per-chapter granularity) + echo "${reader}-${chapter} $reader_gender" >>$spk2gender + done +done + +spk2utt=$dst/spk2utt +utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 + +ntrans=$(wc -l <$trans) +nutt2spk=$(wc -l <$utt2spk) +! 
[ "$ntrans" -eq "$nutt2spk" ] && \ + echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1; + +utils/validate_data_dir.sh --no-feats $dst || exit 1; + +echo "$0: successfully prepared data in $dst" + +exit 0 diff --git a/egs/libri_css/s5_mono/local/data_prep_mono.sh b/egs/libri_css/s5_mono/local/data_prep_mono.sh new file mode 100755 index 00000000000..75f661f79bb --- /dev/null +++ b/egs/libri_css/s5_mono/local/data_prep_mono.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +# +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# Apache 2.0 + +# Begin configuration section. +# End configuration section +data_affix= + +. ./utils/parse_options.sh # accept options + +. ./path.sh + +echo >&2 "$0" "$@" +if [ $# -ne 2 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /export/corpora/LibriCSS /export/corpora/LibriSpeech" + exit 1 +fi + +corpus_dir=$1 +librispeech_dir=$2 + +set -e -o pipefail + +# If data is not already present, then download and unzip +if [ ! -d $corpus_dir/for_release ]; then + echo "Downloading and unpacking LibriCSS data." + CWD=`pwd` + mkdir -p $corpus_dir + + cd $corpus_dir + + # Download the data. If the data has already been downloaded, it + # does nothing. (See wget -c) + wget -c --load-cookies /tmp/cookies.txt \ + "https://docs.google.com/uc?export=download&confirm=$(wget --quiet \ + --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate \ + 'https://docs.google.com/uc?export=download&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l' \ + -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l" \ + -O for_release.zip && rm -rf /tmp/cookies.txt + + # unzip (skip if already extracted) + unzip -n for_release.zip + + # segmentation + cd for_release + python3 segment_libricss.py -data_path . + + cd $CWD +fi + +# Process the downloaded data directory to get data in Kaldi format +if ! [ -d data/local/data${data_affix} ]; then + mkdir -p data/local/data${data_affix}/ + local/prepare_data.py --srcpath $corpus_dir/for_release --tgtpath data/local/data${data_affix} --mics 0 \ + --cleanpath $librispeech_dir +fi + +# Create dev and eval splits based on sessions. In total we have 10 sessions (session0 to +# session9) of approximately 1 hour each. In the below strings, separate each session by +# '\|' to perform grep at once. +dev_sessions="session0" +eval_sessions="session[1-9]" + +mkdir -p data/dev${data_affix} +for file in wav.scp utt2spk text segments wav_clean.scp; do + grep $dev_sessions data/local/data${data_affix}/$file | sort > data/dev${data_affix}/$file +done + +mkdir -p data/eval${data_affix} +for file in wav.scp utt2spk text segments wav_clean.scp; do + grep $eval_sessions data/local/data${data_affix}/$file | sort > data/eval${data_affix}/$file +done + +# Move the utt2spk, segments, and text file to .bak so that they are only used +# in the last scoring stage. We also prepare a dummy utt2spk and spk2utt for +# these. 
+for datadir in dev eval; do
+  for file in text utt2spk segments; do
+    mv data/$datadir${data_affix}/$file data/$datadir${data_affix}/$file.bak
+  done
+
+  awk '{print $1, $1}' data/$datadir${data_affix}/wav.scp > data/$datadir${data_affix}/utt2spk
+  utils/utt2spk_to_spk2utt.pl data/$datadir${data_affix}/utt2spk > data/$datadir${data_affix}/spk2utt
+
+done
diff --git a/egs/libri_css/s5_mono/local/decode.sh b/egs/libri_css/s5_mono/local/decode.sh
new file mode 100755
index 00000000000..620d0319927
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/decode.sh
@@ -0,0 +1,227 @@
+#!/usr/bin/env bash
+#
+# This script decodes raw utterances through the entire pipeline:
+# VAD -> Feature extraction -> Diarization -> ASR
+#
+# Copyright  2017  Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
+#            2019  Desh Raj, David Snyder, Ashish Arora, Zhaoheng Ni
+# Apache 2.0
+
+# Begin configuration section.
+nj=8
+stage=0
+
+diarizer_stage=1
+decode_diarize_stage=0
+decode_oracle_stage=0
+score_stage=0
+nnet3_affix=_cleaned   # affix for the chain directory name
+affix=1d_ft   # affix for the TDNN directory name
+
+# If the following is set to true, we use the oracle speaker and segment
+# information instead of performing SAD and diarization.
+use_oracle_segments=false
+sad_type=webrtc   # Set this to webrtc or tdnn
+rnnlm_rescore=true
+
+# RNNLM rescore options
+ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order
+              # if it's set, it merges histories in the lattice if they share
+              # the same ngram history and this prevents the lattice from
+              # exploding exponentially
+pruned_rescore=true
+rnnlm_dir=exp/rnnlm_lstm_1a
+
+test_sets="dev eval"
+
+. ./utils/parse_options.sh
+
+. ./cmd.sh
+. ./path.sh
+
+# Get dev and eval set names from the test_sets
+dev_set=$( echo $test_sets | cut -d " " -f1 )
+eval_set=$( echo $test_sets | cut -d " " -f2 )
+
+$use_oracle_segments && [ $stage -le 8 ] && stage=8
+
+#######################################################################
+# Perform SAD on the dev/eval data using the py-webrtcvad package
+#######################################################################
+if [ $stage -le 1 ]; then
+  for datadir in ${test_sets}; do
+    test_set=data/${datadir}
+    if [ $sad_type == "webrtc" ]; then
+      echo "Applying WebRTC-VAD on ${datadir}"
+      local/segmentation/apply_webrtcvad.py --mode 0 $test_set | sort > $test_set/segments
+    else
+      echo "Applying TDNN-Stats-SAD on ${datadir}"
+      if [ ! -f ${test_set}/wav.scp ]; then
+        echo "$0: Not performing SAD on ${test_set}, since wav.scp does not exist. Exiting!"
+        exit 0
+      fi
+
+      sad_nj=$(wc -l < "$test_set/wav.scp")
+      sad_nj=$((nj < sad_nj ? nj : sad_nj))   # at most one job per recording
+      # Perform segmentation. We use the pretrained CHiME-6 SAD available at:
+      # http://kaldi-asr.org/models/12/0012_sad_v1.tar.gz
+      # Download and extract using tar -xvzf
+      if [ ! 
-d exp/segmentation_1a/tdnn_stats_sad_1a ]; then + wget http://kaldi-asr.org/models/12/0012_sad_v1.tar.gz || exit + tar -xvzf 0012_sad_v1.tar.gz + cp -r 0012_sad_v1/conf/* conf/ + cp -r 0012_sad_v1/exp/segmentation_1a exp/ + fi + local/detect_speech_activity.sh --cmd "$decode_cmd" --nj $sad_nj $test_set \ + exp/segmentation_1a/tdnn_stats_sad_1a + fi + + # Create dummy utt2spk file from obtained segments + awk '{print $1, $2}' ${test_set}/segments > ${test_set}/utt2spk + utils/utt2spk_to_spk2utt.pl ${test_set}/utt2spk > ${test_set}/spk2utt + + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ + ${test_set}/utt2spk ${test_set}/segments ${test_set}/rttm + + echo "Scoring $datadir.." + # We first generate the reference RTTM from the backed up utt2spk and segments + # files. + ref_rttm=${test_set}/ref_rttm + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py ${test_set}/utt2spk.bak \ + ${test_set}/segments.bak ${test_set}/ref_rttm + + md-eval.pl -r $ref_rttm -s ${test_set}/rttm |\ + awk '/(MISSED|FALARM) SPEECH/' + + done +fi + +####################################################################### +# Feature extraction for the dev and eval data +####################################################################### +if [ $stage -le 2 ]; then + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + mfccdir=mfcc + for x in ${test_sets}; do + steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \ + --mfcc-config conf/mfcc_hires.conf \ + data/$x exp/make_mfcc/$x $mfccdir + done +fi + +####################################################################### +# Perform diarization on the dev/eval data +####################################################################### +if [ $stage -le 3 ]; then + for datadir in ${test_sets}; do + ref_rttm=data/${datadir}/ref_rttm + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py data/${datadir}/utt2spk.bak \ + data/${datadir}/segments.bak $ref_rttm + diar_nj=$(wc -l < "data/$datadir/wav.scp") # This is important especially for VB-HMM + + [ ! 
-d exp/xvector_nnet_1a ] && ./local/download_diarizer.sh
+
+    local/diarize_spectral.sh --nj $diar_nj --cmd "$train_cmd" --stage $diarizer_stage \
+      --ref-rttm $ref_rttm \
+      exp/xvector_nnet_1a \
+      data/${datadir} \
+      exp/${datadir}_diarization
+  done
+fi
+
+#######################################################################
+# Decode diarized output using trained chain model
+#######################################################################
+if [ $stage -le 4 ]; then
+  for datadir in ${test_sets}; do
+    asr_nj=$(wc -l < "data/$datadir/wav.scp")
+    local/decode_diarized.sh --nj $asr_nj --cmd "$decode_cmd" --stage $decode_diarize_stage \
+      --lm-suffix "_tgsmall" \
+      exp/${datadir}_diarization/rttm data/$datadir data/lang_test_tgsmall \
+      exp/chain${nnet3_affix}/tdnn_${affix} exp/nnet3${nnet3_affix} \
+      data/${datadir}_diarized || exit 1
+  done
+fi
+
+#######################################################################
+# Score decoded dev/eval sets
+#######################################################################
+if [ $stage -le 5 ]; then
+  # please specify both dev and eval set directories so that the search parameters
+  # (insertion penalty and language model weight) will be tuned using the dev set
+  local/score_reco_diarized.sh --cmd "$train_cmd" --stage $score_stage \
+    --dev_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${dev_set}_diarized_2stage \
+    --dev_datadir ${dev_set}_diarized_hires \
+    --eval_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${eval_set}_diarized_2stage \
+    --eval_datadir ${eval_set}_diarized_hires
+fi
+
+############################################################################
+# RNNLM rescoring
+############################################################################
+if $rnnlm_rescore; then
+  if [ $stage -le 6 ]; then
+    echo "$0: Perform RNNLM lattice-rescoring"
+    pruned=
+    ac_model_dir=exp/chain${nnet3_affix}/tdnn_${affix}
+    if $pruned_rescore; then
+      pruned=_pruned
+    fi
+    for decode_set in $test_sets; do
+      decode_dir=${ac_model_dir}/decode_${decode_set}_diarized_2stage
+      # Lattice rescoring
+      rnnlm/lmrescore${pruned}.sh \
+        --cmd "$decode_cmd --mem 8G" \
+        --weight 0.45 --max-ngram-order $ngram_order \
+        data/lang_test_tgsmall $rnnlm_dir \
+        data/${decode_set}_diarized_hires ${decode_dir} \
+        ${ac_model_dir}/decode_${decode_set}_diarized_2stage_rescore
+    done
+  fi
+
+  if [ $stage -le 7 ]; then
+    echo "$0: WERs after rescoring with $rnnlm_dir"
+    local/score_reco_diarized.sh --cmd "$train_cmd" --stage $score_stage \
+      --dev_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${dev_set}_diarized_2stage_rescore \
+      --dev_datadir ${dev_set}_diarized_hires \
+      --eval_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${eval_set}_diarized_2stage_rescore \
+      --eval_datadir ${eval_set}_diarized_hires
+  fi
+fi
+
+$use_oracle_segments || exit 0
+
+######################################################################
+# Here we decode using oracle speaker and segment information
+######################################################################
+if [ $stage -le 8 ]; then
+  # mfccdir should be some place with a largish disk where you
+  # want to store MFCC features.
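+  # e.g. you could point this to a scratch filesystem (hypothetical path):
+  # mfccdir=/export/scratch/$USER/libri_css/mfcc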
+  mfccdir=mfcc
+  for x in ${test_sets}; do
+    datadir=data/${x}_oracle
+    mkdir -p $datadir
+
+    cp data/$x/wav.scp $datadir/
+    cp data/$x/segments.bak $datadir/segments
+    cp data/$x/utt2spk.bak $datadir/utt2spk
+    cp data/$x/text.bak $datadir/text
+    utils/utt2spk_to_spk2utt.pl $datadir/utt2spk > $datadir/spk2utt
+
+    steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \
+      --mfcc-config conf/mfcc_hires.conf \
+      $datadir exp/make_mfcc/$x $mfccdir
+  done
+fi
+
+if [ $stage -le 9 ]; then
+  local/decode_oracle.sh --stage $decode_oracle_stage \
+    --affix $affix \
+    --lang-dir data/lang_test_tgsmall \
+    --lm-suffix "_tgsmall" \
+    --rnnlm-rescore $rnnlm_rescore \
+    --test_sets "$test_sets"
+fi
+
+exit 0;
diff --git a/egs/libri_css/s5_mono/local/decode_diarized.sh b/egs/libri_css/s5_mono/local/decode_diarized.sh
new file mode 100755
index 00000000000..a8a858bc1ed
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/decode_diarized.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+# Copyright 2019 Ashish Arora, Vimal Manohar
+# Apache 2.0.
+# This script takes an rttm file and performs decoding on a test directory.
+# The output directory contains a text file which can be used for scoring.
+
+stage=0
+nj=8
+cmd=run.pl
+lm_suffix=
+
+echo "$0 $@"  # Print the command line for logging
+
+. ./path.sh
+. utils/parse_options.sh || exit 1;
+
+if [ $# != 6 ]; then
+  echo "Usage: $0 <rttm-file> <in-data-dir> <lang-dir> <model-dir> <ivector-extractor-dir> <out-dir>"
+  echo "e.g.: $0 data/rttm data/dev data/lang_chain exp/chain/tdnn_1a \
+   exp/nnet3_cleaned data/dev_diarized"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs."
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+rttm=$1
+data_in=$2
+lang_dir=$3
+asr_model_dir=$4
+ivector_extractor=$5
+out_dir=$6
+
+# Note: the decoding graph is built in stage 3 below, so we do not require
+# graph${lm_suffix}/HCLG.fst to exist up front.
+for f in $rttm $data_in/wav.scp $data_in/text.bak \
+     $lang_dir/L.fst $asr_model_dir/final.mdl; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+if [ $stage -le 0 ]; then
+  echo "$0: copying data files to output directory"
+  mkdir -p ${out_dir}_hires
+  cp ${data_in}/{wav.scp,utt2spk,utt2spk.bak} ${out_dir}_hires
+  utils/data/get_reco2dur.sh ${out_dir}_hires
+fi
+
+if [ $stage -le 1 ]; then
+  echo "$0: creating segments file and utt2spk from rttm"
+  local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm \
+    <(awk '{print $2" "$2" "$3}' $rttm | sort -u) \
+    ${out_dir}_hires/utt2spk ${out_dir}_hires/segments
+
+  utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt
+  utils/fix_data_dir.sh ${out_dir}_hires || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: extracting MFCC features using the segments file"
+  steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$cmd" ${out_dir}_hires
+  steps/compute_cmvn_stats.sh ${out_dir}_hires
+  utils/fix_data_dir.sh ${out_dir}_hires || exit 1;
+  cp $data_in/text.bak ${out_dir}_hires/text
+fi
+
+if [ $stage -le 3 ]; then
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 --remove-oov $lang_dir \
+    $asr_model_dir $asr_model_dir/graph${lm_suffix}
+fi
+
+if [ $stage -le 4 ]; then
+  echo "$0: performing decoding on the extracted features"
+  local/nnet3/decode.sh --affix 2stage --acwt 1.0 --post-decode-acwt 10.0 \
+    --frames-per-chunk 150 --nj $nj --ivector-dir $ivector_extractor \
+    $out_dir $lang_dir $asr_model_dir/graph${lm_suffix} $asr_model_dir/
+fi
+
diff --git a/egs/libri_css/s5_mono/local/decode_diarized_css.sh b/egs/libri_css/s5_mono/local/decode_diarized_css.sh
new file mode 100755
index 00000000000..995901a935d
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/decode_diarized_css.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# Copyright 2019 Ashish Arora, Vimal Manohar, Desh Raj
+# Apache 2.0.
+# This script is similar to the decode_diarized.sh script, except that it
+# works on CSS separated audio streams. The key difference here is in how
+# we create segments for feature extraction, since now they have to
+# come from the respective streams.
+
+stage=0
+nj=8
+cmd=run.pl
+lm_suffix=
+acwt=1.0
+post_decode_acwt=10.0
+
+echo "$0 $@"  # Print the command line for logging
+
+. ./path.sh
+. utils/parse_options.sh || exit 1;
+
+if [ $# != 6 ]; then
+  echo "Usage: $0 <rttm-file> <in-data-dir> <lang-dir> <model-dir> <ivector-extractor-dir> <out-dir>"
+  echo "e.g.: $0 data/rttm data/dev data/lang_chain exp/chain/tdnn_1a \
+   exp/nnet3_cleaned data/dev_diarized"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs."
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+rttm=$1
+data_in=$2
+lang_dir=$3
+asr_model_dir=$4
+ivector_extractor=$5
+out_dir=$6
+
+for f in $rttm $data_in/wav.scp $data_in/text.bak \
+     $lang_dir/L.fst $asr_model_dir/final.mdl; do
+  [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 0 ]; then + echo "$0 copying data files in output directory" + mkdir -p ${out_dir}_hires + cp ${data_in}/{wav.scp,utt2spk.bak} ${out_dir}_hires + utils/data/get_reco2dur.sh ${out_dir}_hires +fi + +if [ $stage -le 1 ]; then + echo "$0 creating segments file from rttm and utt2spk, reco2file_and_channel " + local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm \ + <(awk '{print $2" "$2" "$3}' $rttm | sort -u) \ + ${out_dir}_hires/utt2spk.reco ${out_dir}_hires/segments + + # We remove the stream id from the spk id (for speaker-level CMN) + awk '{$2=$2;sub(/_[0-9]*$/, "", $2); print}' ${out_dir}_hires/utt2spk.reco \ + > ${out_dir}_hires/utt2spk + + utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt + utils/fix_data_dir.sh ${out_dir}_hires +fi + +if [ $stage -le 2 ]; then + # Now we extract features + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$cmd" ${out_dir}_hires + steps/compute_cmvn_stats.sh ${out_dir}_hires + utils/fix_data_dir.sh ${out_dir}_hires || exit 1; + cp $data_in/text.bak ${out_dir}_hires/text +fi + +if [ $stage -le 3 ]; then + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_dir \ + $asr_model_dir $asr_model_dir/graph${lm_suffix} +fi + +if [ $stage -le 4 ]; then + echo "$0 performing decoding on the extracted features" + local/nnet3/decode.sh --affix 2stage --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --frames-per-chunk 150 --nj $nj --ivector-dir $ivector_extractor \ + ${out_dir} $lang_dir $asr_model_dir/graph${lm_suffix} $asr_model_dir/ +fi + diff --git a/egs/libri_css/s5_mono/local/decode_oracle.sh b/egs/libri_css/s5_mono/local/decode_oracle.sh new file mode 100755 index 00000000000..6e82142f927 --- /dev/null +++ b/egs/libri_css/s5_mono/local/decode_oracle.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# +# This script performs recognition with oracle speaker and segment information + +# Begin configuration section. +decode_nj=20 +stage=0 +test_sets= +lang_dir= +lm_suffix= +nnet3_affix=_cleaned # affix for the chain directory name +affix=1d # affix for the TDNN directory name +rnnlm_rescore=false + +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + +# RNNLM rescore options +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true +rnnlm_dir=exp/rnnlm_lstm_1a + +dir=exp/chain${nnet3_affix}/tdnn_${affix} + +# Get dev and eval set names from the test_sets +dev_set=$( echo $test_sets | cut -d " " -f1 ) +eval_set=$( echo $test_sets | cut -d " " -f2 ) + + +set -e # exit on error + +########################################################################## +# DECODING: we perform 2 stage decoding. +########################################################################## + +if [ $stage -le 0 ]; then + # First the options that are passed through to run_ivector_common.sh + # (some of which are also used in this script directly). + + # The rest are configs specific to this script. Most of the parameters + # are just hardcoded at this level, in the commands below. + echo "$0: decode data..." 
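+  # Note: local/nnet3/decode.sh below decodes in two passes (hence the "2stage"
+  # affix): a first pass whose lattices are turned into per-frame VAD weights
+  # (see local/extract_vad_weights.sh), which are then used when extracting
+  # i-vectors for the final pass.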
+
+  # training options
+  # training chunk-options
+  chunk_width=150,110,100
+  # we don't need extra left/right context for TDNN systems.
+  chunk_left_context=0
+  chunk_right_context=0
+
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 $lang_dir \
+    $dir $dir/graph${lm_suffix} || exit 1;
+
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      local/nnet3/decode.sh --affix 2stage --pass2-decode-opts "--min-active 1000" \
+        --acwt 1.0 --post-decode-acwt 10.0 \
+        --frames-per-chunk 150 --nj $decode_nj \
+        --ivector-dir exp/nnet3${nnet3_affix} \
+        data/${data}_oracle $lang_dir \
+        $dir/graph${lm_suffix} \
+        exp/chain${nnet3_affix}/tdnn_${affix}
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+
+##########################################################################
+# Scoring: here we obtain WER per condition and overall WER
+##########################################################################
+
+if [ $stage -le 1 ]; then
+  # please specify both dev and eval set directories so that the search parameters
+  # (insertion penalty and language model weight) will be tuned using the dev set
+  local/score_reco_oracle.sh \
+    --dev exp/chain${nnet3_affix}/tdnn_${affix}/decode_${dev_set}_oracle_2stage \
+    --eval exp/chain${nnet3_affix}/tdnn_${affix}/decode_${eval_set}_oracle_2stage
+fi
+
+############################################################################
+# RNNLM rescoring
+############################################################################
+if $rnnlm_rescore; then
+  if [ $stage -le 2 ]; then
+    echo "$0: Perform RNNLM lattice-rescoring"
+    pruned=
+    ac_model_dir=exp/chain${nnet3_affix}/tdnn_${affix}
+    if $pruned_rescore; then
+      pruned=_pruned
+    fi
+    for decode_set in $test_sets; do
+      decode_dir=${ac_model_dir}/decode_${decode_set}_oracle_2stage
+      # Lattice rescoring
+      rnnlm/lmrescore$pruned.sh \
+        --cmd "$decode_cmd --mem 8G" \
+        --weight 0.45 --max-ngram-order $ngram_order \
+        $lang_dir $rnnlm_dir \
+        data/${decode_set}_oracle_hires ${decode_dir} \
+        ${ac_model_dir}/decode_${decode_set}_oracle_2stage_rescore
+    done
+  fi
+  if [ $stage -le 3 ]; then
+    echo "$0: WERs after rescoring with $rnnlm_dir"
+    local/score_reco_oracle.sh \
+      --dev exp/chain${nnet3_affix}/tdnn_${affix}/decode_${dev_set}_oracle_2stage_rescore \
+      --eval exp/chain${nnet3_affix}/tdnn_${affix}/decode_${eval_set}_oracle_2stage_rescore
+  fi
+fi
diff --git a/egs/libri_css/s5_mono/local/detect_speech_activity.sh b/egs/libri_css/s5_mono/local/detect_speech_activity.sh
new file mode 100755
index 00000000000..1b9b062fb8b
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/detect_speech_activity.sh
@@ -0,0 +1,225 @@
+#!/usr/bin/env bash
+
+# Copyright 2016-17  Vimal Manohar
+#              2017  Nagendra Kumar Goel
+# Apache 2.0.
+
+# This script does nnet3-based speech activity detection given an input
+# kaldi data directory and outputs a segmented kaldi data directory.
+
+set -e
+set -o pipefail
+set -u
+
+if [ -f ./path.sh ]; then . 
./path.sh; fi + +affix= # Affix for the segmentation +nj=32 +cmd=queue.pl +stage=-1 + +# Feature options (Must match training) +mfcc_config=conf/mfcc_hires.conf +feat_affix= # Affix for the type of feature used + +output_name=output # The output node in the network +sad_name=sad # Base name for the directory storing the computed loglikes + # Can be music for music detection +segmentation_name=segmentation # Base name for the directory doing segmentation + # Can be segmentation_music for music detection + +# SAD network config +iter=final # Model iteration to use + +# Contexts must ideally match training for LSTM models, but +# may not necessarily for stats components +extra_left_context=0 # Set to some large value, typically 40 for LSTM (must match training) +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +frames_per_chunk=150 + +# Decoding options +graph_opts="--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0" +acwt=1.0 + +# These _in__weight represent the fraction of probability +# to transfer to class. +# e.g. --speech-in-sil-weight=0.0 --garbage-in-sil-weight=0.0 --sil-in-speech-weight=0.0 --garbage-in-speech-weight=0.3 +transform_probs_opts="" + +# Postprocessing options +segment_padding=0.2 # Duration (in seconds) of padding added to segments +min_segment_dur=0 # Minimum duration (in seconds) required for a segment to be included + # This is before any padding. Segments shorter than this duration will be removed. + # This is an alternative to --min-speech-duration above. +merge_consecutive_max_dur=0 # Merge consecutive segments as long as the merged segment is no longer than this many + # seconds. The segments are only merged if their boundaries are touching. + # This is after padding by --segment-padding seconds. + # 0 means do not merge. Use 'inf' to not limit the duration. +cleanup=false # If true, remove files created during feature extraction + +echo $* + +. utils/parse_options.sh + +if [ $# -ne 2 ]; then + echo "This script does nnet3-based speech activity detection given an input kaldi " + echo "data directory and outputs an output kaldi data directory." + echo "See script for details of the options to be supplied." + echo "Usage: $0 " + echo " e.g.: $0 ~/workspace/egs/ami/s5b/data/sdm1/dev exp/nnet3_sad_snr/nnet_tdnn_j_n4 \\" + echo " mfcc_hires exp/segmentation_sad_snr/nnet_tdnn_j_n4" + echo "" + echo "Options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # number of parallel jobs to run." + echo " --stage # stage to do partial re-run from." + echo " --convert-data-dir-to-whole # If true, the input data directory is " + echo " # first converted to whole data directory (i.e. whole recordings) " + echo " # and segmentation is done on that." + echo " # If false, then the original segments are " + echo " # retained and they are split into sub-segments." + echo " --output-name # The output node in the network" + echo " --extra-left-context # Set to some large value, typically 40 for LSTM (must match training)" + echo " --extra-right-context # For BLSTM or statistics pooling" + echo " --cleanup # Remove files created during feature extraction" + exit 1 +fi + +src_data_dir=$1 # The input data directory that needs to be segmented. + # If convert_data_dir_to_whole is true, any segments in that will be ignored. 
+sad_nnet_dir=$2   # The SAD neural network
+
+affix=${affix:+_$affix}
+feat_affix=${feat_affix:+_$feat_affix}
+
+dir=exp/segmentation${affix}
+
+data_id=`basename $src_data_dir`
+sad_dir=${dir}/${sad_name}${affix}_${data_id}${feat_affix}
+seg_dir=${dir}/${segmentation_name}${affix}_${data_id}${feat_affix}
+
+###############################################################################
+## Forward pass through the network and dump the log-likelihoods.
+###############################################################################
+
+frame_subsampling_factor=1
+if [ -f $sad_nnet_dir/frame_subsampling_factor ]; then
+  frame_subsampling_factor=$(cat $sad_nnet_dir/frame_subsampling_factor)
+fi
+
+if [ $stage -le 1 ]; then
+  # mfccdir should be some place with a largish disk where you
+  # want to store MFCC features.
+  mfccdir=mfcc
+  steps/make_mfcc.sh --nj $nj --cmd "$cmd" \
+    --mfcc-config conf/mfcc_hires.conf \
+    $src_data_dir exp/make_mfcc/$data_id $mfccdir
+fi
+
+mkdir -p $dir
+if [ $stage -le 2 ]; then
+  if [ "$(readlink -f $sad_nnet_dir)" != "$(readlink -f $dir)" ]; then
+    cp $sad_nnet_dir/cmvn_opts $dir || exit 1
+  fi
+
+  ########################################################################
+  ## Initialize neural network for decoding using the output $output_name
+  ########################################################################
+
+  if [ ! -z "$output_name" ] && [ "$output_name" != output ]; then
+    $cmd $dir/log/get_nnet_${output_name}.log \
+      nnet3-copy --edits="rename-node old-name=$output_name new-name=output" \
+      $sad_nnet_dir/$iter.raw $dir/${iter}_${output_name}.raw || exit 1
+    iter=${iter}_${output_name}
+  else
+    if ! diff $sad_nnet_dir/$iter.raw $dir/$iter.raw; then
+      cp $sad_nnet_dir/$iter.raw $dir/
+    fi
+  fi
+
+  steps/nnet3/compute_output.sh --nj $nj --cmd "$cmd" \
+    --iter ${iter} \
+    --extra-left-context $extra_left_context \
+    --extra-right-context $extra_right_context \
+    --extra-left-context-initial $extra_left_context_initial \
+    --extra-right-context-final $extra_right_context_final \
+    --frames-per-chunk $frames_per_chunk --apply-exp true \
+    --frame-subsampling-factor $frame_subsampling_factor \
+    ${src_data_dir} $dir $sad_dir || exit 1
+fi
+
+###############################################################################
+## Prepare the FST we search to make speech/silence decisions.
+###############################################################################
+
+utils/data/get_utt2dur.sh --nj $nj --cmd "$cmd" $src_data_dir || exit 1
+frame_shift=$(utils/data/get_frame_shift.sh $src_data_dir) || exit 1
+
+graph_dir=${dir}/graph_${output_name}
+if [ $stage -le 3 ]; then
+  mkdir -p $graph_dir
+
+  # 1 for silence and 2 for speech
+  cat <<EOF > $graph_dir/words.txt
+<eps> 0
+silence 1
+speech 2
+EOF
+
+  $cmd $graph_dir/log/make_graph.log \
+    steps/segmentation/internal/prepare_sad_graph.py $graph_opts \
+      --frame-shift=$(perl -e "print $frame_shift * $frame_subsampling_factor") - \| \
+    fstcompile --isymbols=$graph_dir/words.txt --osymbols=$graph_dir/words.txt '>' \
+      $graph_dir/HCLG.fst
+fi
+
+###############################################################################
+## Do Viterbi decoding to create per-frame alignments.
+###############################################################################
+
+post_vec=$sad_nnet_dir/post_${output_name}.vec
+if [ ! -f $sad_nnet_dir/post_${output_name}.vec ]; then
+  if [ ! -f $sad_nnet_dir/post_${output_name}.txt ]; then
+    echo "$0: Could not find $sad_nnet_dir/post_${output_name}.vec."
" + echo "Re-run the corresponding stage in the training script possibly " + echo "with --compute-average-posteriors=true or compute the priors " + echo "from the training labels" + exit 1 + else + post_vec=$sad_nnet_dir/post_${output_name}.txt + fi +fi + +mkdir -p $seg_dir +if [ $stage -le 4 ]; then + steps/segmentation/internal/get_transform_probs_mat.py \ + --priors="$post_vec" $transform_probs_opts > $seg_dir/transform_probs.mat + + steps/segmentation/decode_sad.sh --acwt $acwt --cmd "$cmd" \ + --nj $nj \ + --transform "$seg_dir/transform_probs.mat" \ + $graph_dir $sad_dir $seg_dir +fi + +############################################################################### +## Post-process segmentation to create kaldi data directory. +############################################################################### + +if [ $stage -le 5 ]; then + steps/segmentation/post_process_sad_to_segments.sh \ + --segment-padding $segment_padding --min-segment-dur $min_segment_dur \ + --merge-consecutive-max-dur $merge_consecutive_max_dur \ + --cmd "$cmd" --frame-shift $(perl -e "print $frame_subsampling_factor * $frame_shift") \ + ${src_data_dir} ${seg_dir} ${seg_dir} +fi + +sed 's:-:_:g' ${seg_dir}/segments > $src_data_dir/segments # to be consistent for scoring + +if [ $cleanup ]; then + rm $src_data_dir/{feats.scp,frame_shift,utt2dur,utt2num_frames} 2> /dev/null +fi + +echo "$0: Created output segments in ${src_data_dir}" +exit 0 \ No newline at end of file diff --git a/egs/libri_css/s5_mono/local/diarization/post_process_css_rttm.py b/egs/libri_css/s5_mono/local/diarization/post_process_css_rttm.py new file mode 100755 index 00000000000..b90ad3d97b9 --- /dev/null +++ b/egs/libri_css/s5_mono/local/diarization/post_process_css_rttm.py @@ -0,0 +1,121 @@ +#! /usr/bin/env python3 +# Copyright 2020 Desh Raj +# Apache 2.0. +"""This script takes an RTTM file and removes same-speaker segments +which may be present at the same time across streams. This is meant +to be used as a post-processing step after performing clustering-based +diarization on top of separated streams of audio. The idea is to +eliminate false alarms caused by leakage, since the separation +method may not be perfect.""" + +import argparse, os +import itertools +from collections import defaultdict + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script takes an RTTM file and removes same-speaker segments + which may be present at the same time across streams. This is meant + to be used as a post-processing step after performing clustering-based + diarization on top of separated streams of audio. 
+        The idea is to eliminate false alarms caused by leakage, since the
+        separation method may not be perfect.""",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("input_rttm", type=str,
+                        help="path of input rttm file")
+    parser.add_argument("output_rttm", type=str,
+                        help="path of output rttm file")
+    args = parser.parse_args()
+    return args
+
+class Segment:
+    def __init__(self, parts):
+        self.reco_id = '_'.join(parts[1].split('_')[:-1])
+        self.stream = int(parts[1].split('_')[-1])
+        self.start_time = float(parts[3])
+        self.duration = float(parts[4])
+        self.end_time = self.start_time + self.duration
+        self.label = int(parts[7])
+
+
+def main():
+    args = get_args()
+
+    # First we read all segments and store them as a list of objects
+    segments = []
+    with open(args.input_rttm, 'r') as f:
+        for line in f.readlines():
+            parts = line.strip().split()
+            segments.append(Segment(parts))
+
+    groupfn = lambda x: (x.reco_id, x.label)
+    # itertools.groupby requires the list to be sorted by the grouping key
+    segments.sort(key=groupfn)
+    # We group the segment list into a dictionary indexed by (reco_id, spk_id)
+    reco_and_spk_to_segs = defaultdict(list,
+        {uid : list(g) for uid, g in itertools.groupby(segments, groupfn)})
+
+    reco_and_spk_to_final_segs = {}
+    for uid in reco_and_spk_to_segs.keys():
+        reco_id, spk_id = uid
+        segs = reco_and_spk_to_segs[uid]
+        tokens = []
+        for seg in segs:
+            tokens.append(('BEG', seg.start_time, seg.stream))
+            tokens.append(('END', seg.end_time, seg.stream))
+        tokens.sort(key=lambda x: x[1])
+
+        # Remove segments which lie completely inside another segment
+        running_segs = {}
+        new_segs = []   # (start_time, end_time, stream)
+        for token in tokens:
+            if token[0] == 'BEG':
+                running_segs[token[2]] = token[1]
+            else:
+                seg_start = running_segs[token[2]]
+                seg_end = token[1]
+                seg_stream = token[2]
+                new_seg = (seg_start, seg_end, seg_stream)
+                del running_segs[token[2]]
+
+                # if this segment was the only running segment, then append
+                if len(running_segs) == 0:
+                    new_segs.append(new_seg)
+                    continue
+
+                # if any running segment started before this one, it means this
+                # segment is totally enclosed within the other, so we don't add it
+                if not any(i < new_seg[0] for i in running_segs.values()):
+                    new_segs.append(new_seg)
+
+        new_segs.sort(key=lambda x: x[0])
+        num_segs = len(new_segs)
+        # Now we have partially overlapping segments. We divide the overlapping
+        # portion equally between the two segments.
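+        # Illustration (hypothetical times): segments (0.0, 5.0) on stream 1
+        # and (4.0, 8.0) on stream 2 overlap on [4.0, 5.0]; the split point is
+        # the midpoint avg = (4.0 + 5.0) / 2 = 4.5, giving (0.0, 4.5) on
+        # stream 1 and (4.5, 8.0) on stream 2.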
+        final_segs = []   # (start_time, end_time, stream)
+        for i in range(num_segs):
+            seg = new_segs[i]
+            # If it is the last segment in the recording, or the last contiguous
+            # segment, add it as is
+            if (i == num_segs - 1 or seg[1] <= new_segs[i+1][0]):
+                final_segs.append(seg)
+            # Otherwise split the overlapping interval between the current and
+            # the next segment at its midpoint
+            else:
+                avg = (new_segs[i+1][0] + seg[1]) / 2
+                final_segs.append((seg[0], avg, seg[2]))
+                if not (avg < new_segs[i+1][1]):
+                    # Pathological case: the midpoint lies beyond the end of the
+                    # next segment; print the offending segments for inspection
+                    print(reco_id, spk_id, seg, new_segs[i+1])
+                new_segs[i+1] = (avg, new_segs[i+1][1], new_segs[i+1][2])
+                # Re-sort the remaining segments, since the modified start time
+                # may change their order. (Note: sorting a slice would sort a
+                # copy, so we assign the sorted slice back in place.)
+                new_segs[i+1:] = sorted(new_segs[i+1:], key=lambda x: x[0])
+        reco_and_spk_to_final_segs[(reco_id, spk_id)] = final_segs
+
+    rttm_str = "SPEAKER {0} 1 {1:7.3f} {2:7.3f} <NA> <NA> {3} <NA> <NA>\n"
+    with open(args.output_rttm, 'w') as f:
+        for (reco_id, spk_id) in sorted(reco_and_spk_to_final_segs):
+            segs = reco_and_spk_to_final_segs[(reco_id, spk_id)]
+            for seg in segs:
+                utt_id = "{}_{}".format(reco_id, seg[2])
+                dur = seg[1] - seg[0]
+                if dur > 0.025:
+                    f.write(rttm_str.format(utt_id, seg[0], dur, spk_id))
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/libri_css/s5_mono/local/diarization/scluster.sh b/egs/libri_css/s5_mono/local/diarization/scluster.sh
new file mode 100755
index 00000000000..374ec192031
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/diarization/scluster.sh
@@ -0,0 +1,100 @@
+#!/bin/bash
+
+# Copyright      2016  David Snyder
+#           2017-2018  Matthew Maciejewski
+#                2020  Maxim Korenevsky (STC-innovations Ltd)
+# Apache 2.0.
+
+# This script performs spectral clustering using scored
+# pairs of subsegments and produces an RTTM file with speaker
+# labels derived from the clusters.
+
+# Begin configuration section.
+cmd="run.pl"
+stage=0
+nj=10
+cleanup=true
+rttm_channel=0
+reco2num_spk=
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 2 ]; then
+  echo "Usage: $0 <src-dir> <output-dir>"
+  echo " e.g.: $0 exp/ivectors_callhome exp/ivectors_callhome/results"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --nj <n|10>                                      # Number of jobs (also see num-processes and num-threads)"
+  echo "  --stage <stage|0>                                # To control partial reruns"
+  echo "  --rttm-channel <rttm-channel|0>                  # The value passed into the RTTM channel field. Only affects"
+  echo "                                                   # the format of the RTTM file."
+  echo "  --reco2num-spk <reco2num-spk-file>               # File containing mapping of recording ID"
+  echo "                                                   # to number of speakers. Used instead of threshold"
+  echo "                                                   # as stopping criterion if supplied."
+  echo "  --cleanup <bool|true>                            # If true, remove temporary files"
+  exit 1;
+fi
+
+srcdir=$1/cossim_scores
+xvec_dir=$1
+dir=$2
+
+mkdir -p $dir/tmp
+
+for f in $srcdir/scores.scp $srcdir/spk2utt $srcdir/utt2spk $xvec_dir/segments.bak ; do
+  [ ! -f $f ] && echo "No such file $f" && exit 1;
+done
+
+cp $srcdir/spk2utt $dir/tmp/
+cp $srcdir/utt2spk $dir/tmp/
+cp $srcdir/segments $dir/tmp/
+utils/fix_data_dir.sh $dir/tmp > /dev/null
+
+if [ ! -z "$reco2num_spk" ]; then
+  reco2num_spk="ark,t:$reco2num_spk"
+fi
+
+sdata=$dir/tmp/split$nj;
+utils/split_data.sh $dir/tmp $nj || exit 1;
+
+# Set various variables.
+mkdir -p $dir/log
+
+feats="utils/filter_scp.pl $sdata/JOB/spk2utt $srcdir/scores.scp |"
+
+reco2num_spk_opt=
+if [ -n "$reco2num_spk" ]; then
+  reco2num_spk_opt="--reco2num-spk $reco2num_spk"
+fi
+
+if [ $stage -le 0 ]; then
+  echo "$0: clustering scores"
+  for j in `seq $nj`; do
+    utils/filter_scp.pl $sdata/$j/spk2utt $srcdir/scores.scp > $dir/scores.$j.scp
+  done
+  $cmd JOB=1:$nj $dir/log/spectral_cluster.JOB.log \
+    diarization/spec_clust.py $reco2num_spk_opt \
+      scp:$dir/scores.JOB.scp ark,t:$sdata/JOB/spk2utt ark,t:$dir/labels.JOB || exit 1;
+fi
+
+if [ $stage -le 1 ]; then
+  echo "$0: combining labels"
+  for j in $(seq $nj); do cat $dir/labels.$j; done > $dir/labels || exit 1;
+fi
+
+# Note that here we use the segments.bak file, which contains the mapping from
+# subsegments to the original stream. This ensures that segments do not cross
+# streams (since we will perform ASR on them later).
+if [ $stage -le 2 ]; then
+  echo "$0: computing RTTM"
+  diarization/make_rttm.py --rttm-channel $rttm_channel $xvec_dir/segments.bak $dir/labels $dir/rttm || exit 1;
+fi
+
+if $cleanup ; then
+  rm -r $dir/tmp || exit 1;
+fi
\ No newline at end of file
diff --git a/egs/libri_css/s5_mono/local/diarize.sh b/egs/libri_css/s5_mono/local/diarize.sh
new file mode 100755
index 00000000000..83e6fe72267
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/diarize.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+# Copyright  2019  David Snyder
+#            2020  Desh Raj
+
+# Apache 2.0.
+#
+# This script takes an input directory that has a segments file (and
+# a feats.scp file), and performs diarization on it. The output directory
+# contains an RTTM file which can be used to resegment the input data.
+
+stage=0
+nj=10
+cmd="run.pl"
+ref_rttm=
+score_overlaps_only=true
+
+echo "$0 $@"  # Print the command line for logging
+
+set -e
+
+. ./path.sh
+. parse_options.sh
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <model-dir> <in-data-dir> <out-dir>"
+  echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs."
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --ref_rttm ./local/dev_rttm                      # the location of the reference RTTM file"
+  exit 1;
+fi
+
+model_dir=$1
+data_in=$2
+out_dir=$3
+
+name=$(basename "$data_in")
+
+for f in $data_in/feats.scp $data_in/segments $model_dir/plda \
+  $model_dir/final.raw $model_dir/extract.config; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+if [ $stage -le 1 ]; then
+  echo "$0: computing features for x-vector extractor"
+  utils/fix_data_dir.sh data/${name}
+  rm -rf data/${name}_cmn
+  local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \
+    data/$name data/${name}_cmn exp/${name}_cmn
+  cp data/$name/segments exp/${name}_cmn/
+  utils/fix_data_dir.sh data/${name}_cmn
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: extracting x-vectors for all segments"
+  diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \
+    --nj $nj --window 1.5 --period 0.75 --apply-cmn false \
+    --min-segment 0.5 $model_dir \
+    data/${name}_cmn $out_dir/xvectors_${name}
+fi
+
+# Perform PLDA scoring
+if [ $stage -le 3 ]; then
+  # Perform PLDA scoring on all pairs of segments for each recording.
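+  # For a recording with N subsegment x-vectors, this produces an N x N matrix
+  # of pairwise PLDA similarity scores (written under plda_scores below), which
+  # the clustering in the next stage consumes.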
+ echo "$0: performing PLDA scoring between all pairs of x-vectors" + diarization/nnet3/xvector/score_plda.sh --cmd "$cmd" \ + --target-energy 0.5 \ + --nj $nj $model_dir/ $out_dir/xvectors_${name} \ + $out_dir/xvectors_${name}/plda_scores +fi + +if [ $stage -le 4 ]; then + echo "$0: performing clustering using PLDA scores (threshold tuned on dev)" + diarization/cluster.sh --cmd "$cmd" --nj $nj \ + --rttm-channel 1 --threshold 0.4 \ + $out_dir/xvectors_${name}/plda_scores $out_dir + echo "$0: wrote RTTM to output directory ${out_dir}" +fi + +hyp_rttm=${out_dir}/rttm + +if [ $stage -le 5 ]; then + echo "Diarization results for "${name} + local/dscore.sh --score-overlaps-only $score_overlaps_only \ + $ref_rttm $hyp_rttm +fi diff --git a/egs/libri_css/s5_mono/local/diarize_css.sh b/egs/libri_css/s5_mono/local/diarize_css.sh new file mode 100755 index 00000000000..4808f763a59 --- /dev/null +++ b/egs/libri_css/s5_mono/local/diarize_css.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# Copyright 2019 David Snyder +# 2020 Desh Raj + +# Apache 2.0. +# +# This script is exactly the same as local/diarize.sh until +# stage 2 (x-vector extraction), but after that, it is slightly +# different. The key difference is that since we have multiple +# streams of audio (and subsequently multiple streams of subsegments) +# from the same recording, we want to perform cosine scoring across +# all of these streams. + +stage=0 +nj=10 +cmd="run.pl" +ref_rttm= +window=1.5 +period=0.75 +min_segment=0.5 +post_process_rttm=false # set to true to remove same speaker segments in different + # streams at the same time +score_overlaps_only=true + +echo "$0 $@" # Print the command line for logging + +set -e + +. ./path.sh +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization" + echo "Options: " + echo " --nj # number of parallel jobs." + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --ref_rttm ./local/dev_rttm # the location of the reference RTTM file" + exit 1; +fi + +model_dir=$1 +data_in=$2 +out_dir=$3 + +name=$(basename "$data_in") + +for f in $data_in/feats.scp $data_in/segments \ + $model_dir/final.raw $model_dir/extract.config; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 1 ]; then + echo "$0: computing features for x-vector extractor" + utils/fix_data_dir.sh data/${name} + rm -rf data/${name}_cmn + local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \ + data/$name data/${name}_cmn exp/${name}_cmn + cp data/$name/segments exp/${name}_cmn/ + utils/fix_data_dir.sh data/${name}_cmn +fi + +if [ $stage -le 2 ]; then + echo "$0: extracting x-vectors for all segments" + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \ + --nj $nj --window $window --period $period --apply-cmn false \ + --min-segment $min_segment $model_dir \ + data/${name}_cmn $out_dir/xvectors_${name} +fi + +# Perform cosine scoring. The following stage is the key difference. +# We change the segments and utt2spk files in the xvector directory +# to reflect that the subsegments are from the same recording. +# But we also keep the original segments file since that will +# be required in subsequent stages for ASR decoding. +if [ $stage -le 3 ]; then + # The if condition is just to ensure that we don't accidentally + # make this modification more than once (which would mess up the + # segments file) + if [ ! 
-f ${out_dir}/xvectors_${name}/segments.bak ]; then + mv ${out_dir}/xvectors_${name}/segments ${out_dir}/xvectors_${name}/segments.bak + mv ${out_dir}/xvectors_${name}/utt2spk ${out_dir}/xvectors_${name}/utt2spk.bak + awk '{$2=$2;sub(/_[0-9]*$/, "", $2); print}' ${out_dir}/xvectors_${name}/segments.bak \ + > ${out_dir}/xvectors_${name}/segments + awk '{$2=$2;sub(/_[0-9]*$/, "", $2); print}' ${out_dir}/xvectors_${name}/utt2spk.bak \ + > ${out_dir}/xvectors_${name}/utt2spk + utils/utt2spk_to_spk2utt.pl ${out_dir}/xvectors_${name}/utt2spk > ${out_dir}/xvectors_${name}/spk2utt + fi +fi + +# nj needs to be changed since we now have #wav/#streams number +# of recordings. Just get it from the segments file +new_nj=$(cat ${out_dir}/xvectors_${name}/segments | cut -d' ' -f2 | uniq | wc -l) +nj=$(echo $((nj>new_nj ? new_nj : nj))) + +if [ $stage -le 4 ]; then + # Perform cosine similarity scoring on all pairs of segments for each recording. + echo "$0: performing cosine similarity scoring between all pairs of x-vectors" + diarization/score_cossim.sh --cmd "$cmd" \ + --nj $nj $out_dir/xvectors_${name} \ + $out_dir/xvectors_${name}/cossim_scores +fi + +if [ $stage -le 5 ]; then + echo "$0: performing spectral clustering using cosine similarity scores" + local/diarization/scluster.sh --cmd "$cmd" --nj $nj \ + --rttm-channel 1 \ + $out_dir/xvectors_${name} $out_dir + echo "$0: wrote RTTM to output directory ${out_dir}" + + # The above clustering generates RTTM with reco separated into streams, + # so we have to remove the stream name for evaluation. + awk '{$2=$2;sub(/_[0-9]*$/, "", $2); print}' ${out_dir}/rttm \ + > ${out_dir}/rttm.comb +fi + +if [ $stage -le 6 ] && [ $post_process_rttm == "true" ]; then + echo "$0: applying post-processing to remove simultaneous same-speaker segments" + local/diarization/post_process_css_rttm.py ${out_dir}/rttm ${out_dir}/rttm.post + + awk '{$2=$2;sub(/_[0-9]*$/, "", $2); print}' ${out_dir}/rttm.post \ + > ${out_dir}/rttm.comb +fi + +hyp_rttm=${out_dir}/rttm.comb + +if [ $stage -le 7 ]; then + echo "Diarization results for "${name} + local/dscore.sh --score-overlaps-only $score_overlaps_only \ + $ref_rttm $hyp_rttm +fi diff --git a/egs/libri_css/s5_mono/local/diarize_spectral.sh b/egs/libri_css/s5_mono/local/diarize_spectral.sh new file mode 100755 index 00000000000..0b12e5a57ea --- /dev/null +++ b/egs/libri_css/s5_mono/local/diarize_spectral.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# Copyright 2019 David Snyder +# 2020 Desh Raj + +# Apache 2.0. +# +# This is similar to local/diarize.sh but uses spectral clustering instead +# of AHC. + +stage=0 +nj=10 +cmd="run.pl" +ref_rttm= +score_overlaps_only=true + +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization" + echo "Options: " + echo " --nj # number of parallel jobs." + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --ref_rttm ./local/dev_rttm # the location of the reference RTTM file" + exit 1; +fi + +model_dir=$1 +data_in=$2 +out_dir=$3 + +name=$(basename "$data_in") + +for f in $data_in/feats.scp $data_in/segments \ + $model_dir/final.raw $model_dir/extract.config; do + [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+if [ $stage -le 1 ]; then
+  echo "$0: computing features for x-vector extractor"
+  utils/fix_data_dir.sh data/${name}
+  rm -rf data/${name}_cmn
+  local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \
+    data/$name data/${name}_cmn exp/${name}_cmn
+  cp data/$name/segments exp/${name}_cmn/
+  utils/fix_data_dir.sh data/${name}_cmn
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: extracting x-vectors for all segments"
+  diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \
+    --nj $nj --window 1.5 --period 0.75 --apply-cmn false \
+    --min-segment 0.5 $model_dir \
+    data/${name}_cmn $out_dir/xvectors_${name}
+fi
+
+# Perform cosine similarity scoring
+if [ $stage -le 3 ]; then
+  # Perform cosine similarity scoring on all pairs of segments for each recording.
+  echo "$0: performing cosine similarity scoring between all pairs of x-vectors"
+  diarization/score_cossim.sh --cmd "$cmd" \
+    --nj $nj $out_dir/xvectors_${name} \
+    $out_dir/xvectors_${name}/cossim_scores
+fi
+
+if [ $stage -le 4 ]; then
+  echo "$0: performing spectral clustering using cosine similarity scores"
+  diarization/scluster.sh --cmd "$cmd" --nj $nj \
+    --rttm-channel 1 \
+    $out_dir/xvectors_${name}/cossim_scores $out_dir
+  echo "$0: wrote RTTM to output directory ${out_dir}"
+fi
+
+hyp_rttm=${out_dir}/rttm
+
+if [ $stage -le 5 ]; then
+  echo "Diarization results for ${name}"
+  local/dscore.sh --score-overlaps-only $score_overlaps_only \
+    $ref_rttm $hyp_rttm
+fi
diff --git a/egs/libri_css/s5_mono/local/download_and_untar.sh b/egs/libri_css/s5_mono/local/download_and_untar.sh
new file mode 100755
index 00000000000..5cf6adde8bc
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/download_and_untar.sh
@@ -0,0 +1,100 @@
+#!/usr/bin/env bash
+
+# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: dev-clean, test-clean, dev-other, test-other,"
+  echo "          train-clean-100, train-clean-360, train-other-500."
+  exit 1
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+part_ok=false
+list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f $data/LibriSpeech/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+
+# Sizes of the archive files in bytes. These are for some older versions of
+# the corpus.
+sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128"
+# sizes_new contains the archive file sizes of the final release. Some of these
+# sizes are for files we probably won't download.
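+# A partially downloaded archive will not match any of the sizes above or
+# below, so the check that follows deletes it and the download is restarted.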
+sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606"
+
+if [ -f $data/$part.tar.gz ]; then
+  size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tar.gz
+  else
+    echo "$data/$part.tar.gz exists and appears to be complete."
+  fi
+fi
+
+pushd $data
+
+if [ ! -f $part.tar.gz ]; then
+  if ! which wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tar.gz
+  echo "$0: downloading data from $full_url. This may take some time, please be patient."
+
+  if ! wget --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+if ! tar -xvzf $part.tar.gz; then
+  echo "$0: error un-tarring archive $data/$part.tar.gz"
+  exit 1;
+fi
+
+popd >&/dev/null
+
+touch $data/LibriSpeech/$part/.complete
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
+  rm $data/$part.tar.gz
+fi
diff --git a/egs/libri_css/s5_mono/local/download_diarizer.sh b/egs/libri_css/s5_mono/local/download_diarizer.sh
new file mode 100755
index 00000000000..a0ef096e10d
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/download_diarizer.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+#
+# Copyright 2020 University of Stuttgart (Author: Pavel Denisov)
+# Apache 2.0
+
+# Begin configuration section.
+# End configuration section
+. ./utils/parse_options.sh  # accept options
+
+. ./path.sh
+
+echo >&2 "$0" "$@"
+if [ $# -ne 0 ] ; then
+  echo >&2 "$0" "$@"
+  echo >&2 "$0: Error: this script takes no arguments"
+  echo -e >&2 "Usage:\n  $0"
+  exit 1
+fi
+
+
+set -e -o pipefail
+
+mkdir -p downloads
+dir=$(mktemp -d ./downloads/lcss.XXXXXXXXX)
+trap "rm -rf ${dir}" EXIT
+
+cd ${dir}
+
+# Download x-vector extractor trained on VoxCeleb2 data
+wget http://kaldi-asr.org/models/12/0012_diarization_v1.tar.gz
+tar -xvzf 0012_diarization_v1.tar.gz
+rm -f 0012_diarization_v1.tar.gz
+
+# Download PLDA model trained on augmented LibriSpeech data
+rm 0012_diarization_v1/exp/xvector_nnet_1a/plda
+wget https://desh2608.github.io/static/files/jsalt/plda -P 0012_diarization_v1/exp/xvector_nnet_1a/
+cd ../..
+cp -r ${dir}/0012_diarization_v1/exp .
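+# The copied exp/xvector_nnet_1a directory now contains final.raw,
+# extract.config and plda -- the files that the local/diarize*.sh scripts
+# check for before diarization.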
diff --git a/egs/libri_css/s5_mono/local/download_lm.sh b/egs/libri_css/s5_mono/local/download_lm.sh
new file mode 100755
index 00000000000..129ca1edbe3
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/download_lm.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+
+# Copyright 2014 Vassil Panayotov
+# Apache 2.0
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <base-url> <dst-dir>"
+  echo "e.g.: $0 http://www.openslr.org/resources/11 data/local/lm"
+  exit 1
+fi
+
+base_url=$1
+dst_dir=$2
+
+# given a filename returns the corresponding file size in bytes
+# The switch cases below can be autogenerated by entering the data directory and running:
+# for f in *; do echo "\"$f\") echo \"$(du -b $f | awk '{print $1}')\";;"; done
+function filesize() {
+  case $1 in
+    "3-gram.arpa.gz") echo "759636181";;
+    "3-gram.pruned.1e-7.arpa.gz") echo "34094057";;
+    "3-gram.pruned.3e-7.arpa.gz") echo "13654242";;
+    "4-gram.arpa.gz") echo "1355172078";;
+    "g2p-model-5") echo "20098243";;
+    "librispeech-lexicon.txt") echo "5627653";;
+    "librispeech-lm-corpus.tgz") echo "1803499244";;
+    "librispeech-lm-norm.txt.gz") echo "1507274412";;
+    "librispeech-vocab.txt") echo "1737588";;
+    *) echo "";;
+  esac
+}
+
+function check_and_download () {
+  [[ $# -eq 1 ]] || { echo "check_and_download() expects exactly one argument!"; return 1; }
+  fname=$1
+  echo "Downloading file '$fname' into '$dst_dir'..."
+  expect_size="$(filesize $fname)"
+  [[ ! -z "$expect_size" ]] || { echo "Unknown file size for '$fname'"; return 1; }
+  if [[ -s $dst_dir/$fname ]]; then
+    # In the following statement, the 'du' version works on Linux, and the
+    # 'stat' fallback after '||' works on macOS/BSD.
+    f=$dst_dir/$fname
+    fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f)
+    if [[ "$fsize" -eq "$expect_size" ]]; then
+      echo "'$fname' already exists and appears to be complete"
+      return 0
+    else
+      echo "WARNING: '$fname' exists, but the size is wrong - re-downloading ..."
+    fi
+  fi
+  wget --no-check-certificate -O $dst_dir/$fname $base_url/$fname || {
+    echo "Error while trying to download $fname!"
+    return 1
+  }
+  f=$dst_dir/$fname
+  # As above: 'du' works on Linux, the 'stat' fallback on macOS/BSD.
+  fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f)
+  [[ "$fsize" -eq "$expect_size" ]] || { echo "$fname: file size mismatch!"; return 1; }
+  return 0
+}
+
+mkdir -p $dst_dir
+
+for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz 4-gram.arpa.gz \
+         g2p-model-5 librispeech-lm-corpus.tgz librispeech-vocab.txt librispeech-lexicon.txt; do
+  check_and_download $f || exit 1
+done
+
+cd $dst_dir
+ln -sf 3-gram.pruned.1e-7.arpa.gz lm_tgmed.arpa.gz
+ln -sf 3-gram.pruned.3e-7.arpa.gz lm_tgsmall.arpa.gz
+ln -sf 3-gram.arpa.gz lm_tglarge.arpa.gz
+ln -sf 4-gram.arpa.gz lm_fglarge.arpa.gz
+
+exit 0
diff --git a/egs/libri_css/s5_mono/local/dscore.sh b/egs/libri_css/s5_mono/local/dscore.sh
new file mode 100644
index 00000000000..43665aba4a2
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/dscore.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+# Copyright   2020   Desh Raj
+
+# Apache 2.0.
+#
+# This script installs a fork of the dscore toolkit
+# (https://github.com/nryant/dscore), which also supports
+# evaluating the overlapping regions only. It then scores
+# the output sys_rttm based on the provided ref_rttm.
+
+score_overlaps_only=true
+
+echo "$0 $@"  # Print the command line for logging
+
+set -e
+
+. ./path.sh
+. 
parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/test/rttm exp/test_diarization/rttm" + exit 1; +fi + +ref_rttm=$1 +hyp_rttm=$2 + +if ! [ -d dscore ]; then + git clone https://github.com/desh2608/dscore.git -b libricss --single-branch + cd dscore + python3 -m pip install --user -r requirements.txt + cd .. +fi + +# Create per condition ref and hyp RTTM files for scoring per condition +mkdir -p tmp +trap "rm -r tmp" EXIT + +conditions="0L 0S OV10 OV20 OV30 OV40" +cp $ref_rttm tmp/ref.all +cp $hyp_rttm tmp/hyp.all +for rttm in ref hyp; do + for cond in $conditions; do + cat tmp/$rttm.all | grep $cond > tmp/$rttm.$cond + done +done + +echo "Scoring all regions..." +for cond in $conditions 'all'; do + echo -n "Condition: $cond: " + ref_rttm_path=$(readlink -f tmp/ref.$cond) + hyp_rttm_path=$(readlink -f tmp/hyp.$cond) + cd dscore + python3 score.py -r $ref_rttm_path -s $hyp_rttm_path --global_only + cd .. +done + +# We also score overlapping regions only +if [ $score_overlaps_only == "true" ]; then + echo "Scoring overlapping regions..." + for cond in $conditions 'all'; do + echo -n "Condition: $cond: " + ref_rttm_path=$(readlink -f tmp/ref.$cond) + hyp_rttm_path=$(readlink -f tmp/hyp.$cond) + cd dscore + python3 score.py -r $ref_rttm_path -s $hyp_rttm_path --overlap_only --global_only + cd .. + done +fi diff --git a/egs/libri_css/s5_mono/local/extract_vad_weights.sh b/egs/libri_css/s5_mono/local/extract_vad_weights.sh new file mode 100755 index 00000000000..d5019f100b1 --- /dev/null +++ b/egs/libri_css/s5_mono/local/extract_vad_weights.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script converts lattices available from a first pass decode into a per-frame weights file +# The ctms generated from the lattices are filtered. Silence frames are assigned a low weight (e.g.0.00001) +# and voiced frames have a weight of 1. + +set -e + +stage=1 +cmd=run.pl +silence_weight=0.00001 +#end configuration section. + +. ./cmd.sh + +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; +if [ $# -ne 4 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + exit 1; +fi + +data_dir=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +decode_dir=$3 +output_wts_file_gz=$4 + +if [ $stage -le 1 ]; then + echo "$0: generating CTM from input lattices" + steps/get_ctm_conf.sh --cmd "$cmd" \ + --use-segments false \ + $data_dir \ + $lang \ + $decode_dir +fi + +if [ $stage -le 2 ]; then + name=`basename $data_dir` + # we just take the ctm from LMWT 10, it doesn't seem to affect the results a lot + ctm=$decode_dir/score_10/$name.ctm + echo "$0: generating weights file from ctm $ctm" + + pad_frames=0 # this did not seem to be helpful but leaving it as an option. + feat-to-len scp:$data_dir/feats.scp ark,t:- >$decode_dir/utt.lengths + if [ ! 
-f $ctm ]; then echo "$0: expected ctm to exist: $ctm"; exit 1; fi + + cat $ctm | awk '$6 == 1.0 && $4 < 1.0' | \ + grep -v -w mm | grep -v -w mhm | grep -v -F '[noise]' | \ + grep -v -F '[laughter]' | grep -v -F '<unk>' | \ + perl -e ' $lengths=shift @ARGV; $pad_frames=shift @ARGV; $silence_weight=shift @ARGV; + $pad_frames >= 0 || die "bad pad-frames value $pad_frames"; + open(L, "<$lengths") || die "opening lengths file"; + @all_utts = (); + %utt2ref = ( ); + while (<L>) { + ($utt, $len) = split(" ", $_); + push @all_utts, $utt; + $array_ref = [ ]; + for ($n = 0; $n < $len; $n++) { ${$array_ref}[$n] = $silence_weight; } + $utt2ref{$utt} = $array_ref; + } + while (<STDIN>) { + @A = split(" ", $_); + @A == 6 || die "bad ctm line $_"; + $utt = $A[0]; $beg = $A[2]; $len = $A[3]; + $beg_int = int($beg * 100) - $pad_frames; + $len_int = int($len * 100) + 2*$pad_frames; + $array_ref = $utt2ref{$utt}; + !defined $array_ref && die "No length info for utterance $utt"; + for ($t = $beg_int; $t < $beg_int + $len_int; $t++) { + if ($t >= 0 && $t < @$array_ref) { + ${$array_ref}[$t] = 1; + } + } + } + foreach $utt (@all_utts) { $array_ref = $utt2ref{$utt}; + print $utt, " [ ", join(" ", @$array_ref), " ]\n"; + } ' $decode_dir/utt.lengths $pad_frames $silence_weight | \ + gzip -c > $output_wts_file_gz +fi diff --git a/egs/libri_css/s5_mono/local/format_lms.sh b/egs/libri_css/s5_mono/local/format_lms.sh new file mode 100755 index 00000000000..d1a18bada88 --- /dev/null +++ b/egs/libri_css/s5_mono/local/format_lms.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +# Copyright 2014 Vassil Panayotov +# Apache 2.0 + +# Prepares the test-time language model (G) transducers +# (adapted from wsj/s5/local/wsj_format_data.sh) + +. ./path.sh || exit 1; + +# begin configuration section +src_dir=data/lang +# end configuration section + +. utils/parse_options.sh || exit 1; + +set -e + +if [ $# -ne 1 ]; then + echo "Usage: $0 <lm-dir>" + echo "e.g.: $0 /export/a15/vpanayotov/data/lm" + echo ", where:" + echo " <lm-dir> is the directory in which the language model is stored/downloaded" + echo "Options:" + echo " --src-dir <lang-dir> # source lang directory, default data/lang" + exit 1 +fi + +lm_dir=$1 + +if [ ! -d $lm_dir ]; then + echo "$0: expected source LM directory $lm_dir to exist" + exit 1; +fi +if [ ! -f $src_dir/words.txt ]; then + echo "$0: expected $src_dir/words.txt to exist." + exit 1; +fi + + +tmpdir=data/local/lm_tmp.$$ +trap "rm -r $tmpdir" EXIT + +mkdir -p $tmpdir + +for lm_suffix in tgsmall tgmed; do + # tglarge is prepared by a separate command, called from run.sh; we don't + # want to compile G.fst for tglarge, as it takes a while. + test=${src_dir}_test_${lm_suffix} + mkdir -p $test + cp -r ${src_dir}/* $test + gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst + utils/validate_lang.pl --skip-determinization-check $test || exit 1; +done + +echo "Succeeded in formatting data." + +exit 0 diff --git a/egs/libri_css/s5_mono/local/get_perspeaker_output.py b/egs/libri_css/s5_mono/local/get_perspeaker_output.py new file mode 100755 index 00000000000..fcf60f708a2 --- /dev/null +++ b/egs/libri_css/s5_mono/local/get_perspeaker_output.py @@ -0,0 +1,91 @@ +#! /usr/bin/env python3 +# Copyright 2020 Desh Raj +# Apache 2.0. 
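+# Example invocation (cf. local/multispeaker_score.sh, which calls this script; +# 'out/' and 'data/dev' below are illustrative paths): +#   local/get_perspeaker_output.py --affix "ref" out/ref_filt.txt data/dev/utt2spk out/ +# This writes one file per (recording, speaker) pair into the output directory.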
+"""This script splits a kaldi output (text) file + into per_speaker output (text) file""" + +import argparse, os +import itertools +from collections import defaultdict + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script splits a kaldi text file + into per_speaker text files""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--affix", type=str, + help="Append in front of output file") + parser.add_argument("--multi-stream", dest='multi_stream', action='store_true', + default=False, + help="Score with multiple decoding streams e.g. CSS") + parser.add_argument("input_text", type=str, + help="path of text file") + parser.add_argument("input_utt2spk", type=str, + help="path of utt2spk file") + parser.add_argument("output_dir", type=str, + help="Output path for per_session per_speaker reference files") + args = parser.parse_args() + return args + +class Utterance: + """Stores all information about an utterance""" + reco_id = '' + spk_id = '' + text = '' + start_time = 0 + end_time = 0 + + def __init__(self, uttid, spkid, text, multi_stream): + parts = uttid.strip().split('_') + self.reco_id = '_'.join(parts[1:4]) + if not multi_stream: + self.start_time = float(parts[4])/100 + self.end_time = float(parts[5])/100 + else: + self.start_time = float(parts[5])/100 + self.end_time = float(parts[6])/100 + self.spk_id = spkid + self.text = text + +def main(): + args = get_args() + utt2spk = {} + utt_list = [] + + # First we read the utt2spk file and create a mapping + for line in open(args.input_utt2spk): + uttid, spkid = line.strip().split() + utt2spk[uttid] = spkid + + # Next we read the input text file and create a list of + # Utterance class objects + for line in open(args.input_text): + parts = line.strip().split(maxsplit=1) + uttid = parts[0] + text = "" if len(parts) == 1 else parts[1] + utterance = Utterance(uttid, utt2spk[uttid], text, args.multi_stream) + utt_list.append(utterance) + + groupfn = lambda x: (x.reco_id, x.spk_id) + sort(utt_list, key=groupfn) + # We group the utterance list into a dictionary indexed by (reco_id, spk_id) + reco_spk_to_utts = defaultdict(list, + {uid : list(g) for uid, g in itertools.groupby(utt_list, groupfn)}) + + # Now for each (reco_id, spk_id) pair, we write the concatenated text to an + # output (we assign speaker ids 1,2,3,..) + for i, uid in enumerate(sorted(reco_spk_to_utts.keys())): + reco_id = reco_spk_to_utts[uid][0].reco_id + output_file = os.path.join(args.output_dir, '{}_{}_{}_comb'.format(args.affix, i, reco_id)) + output_writer = open(output_file, 'w') + utterances = reco_spk_to_utts[uid] + + # We sort all utterances by start time and concatenate. + sorted_utterances = sorted(utterances, key=lambda x: x.start_time) + combined_text = ' '.join([utt.text for utt in sorted_utterances]) + + output_writer.write("{} {}".format(reco_id, combined_text)) + output_writer.close() + +if __name__ == '__main__': + main() diff --git a/egs/libri_css/s5_mono/local/make_voxceleb1.pl b/egs/libri_css/s5_mono/local/make_voxceleb1.pl new file mode 100755 index 00000000000..2268c20ab52 --- /dev/null +++ b/egs/libri_css/s5_mono/local/make_voxceleb1.pl @@ -0,0 +1,130 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# +# Usage: make_voxceleb1.pl /export/voxceleb1 data/ + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/voxceleb1 data/\n"; + exit(1); +} + +($data_base, $out_dir) = @ARGV; +my $out_test_dir = "$out_dir/voxceleb1_test"; +my $out_train_dir = "$out_dir/voxceleb1_train"; + +if (system("mkdir -p $out_test_dir") != 0) { + die "Error making directory $out_test_dir"; +} + +if (system("mkdir -p $out_train_dir") != 0) { + die "Error making directory $out_train_dir"; +} + +opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (! -e "$data_base/voxceleb1_test.txt") { + system("wget -O $data_base/voxceleb1_test.txt http://www.openslr.org/resources/49/voxceleb1_test.txt"); +} + +if (! -e "$data_base/vox1_meta.csv") { + system("wget -O $data_base/vox1_meta.csv http://www.openslr.org/resources/49/vox1_meta.csv"); +} + +open(TRIAL_IN, "<", "$data_base/voxceleb1_test.txt") or die "Could not open the verification trials file $data_base/voxceleb1_test.txt"; +open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv"; +open(SPKR_TEST, ">", "$out_test_dir/utt2spk") or die "Could not open the output file $out_test_dir/utt2spk"; +open(WAV_TEST, ">", "$out_test_dir/wav.scp") or die "Could not open the output file $out_test_dir/wav.scp"; +open(SPKR_TRAIN, ">", "$out_train_dir/utt2spk") or die "Could not open the output file $out_train_dir/utt2spk"; +open(WAV_TRAIN, ">", "$out_train_dir/wav.scp") or die "Could not open the output file $out_train_dir/wav.scp"; +open(TRIAL_OUT, ">", "$out_test_dir/trials") or die "Could not open the output file $out_test_dir/trials"; + +my %id2spkr = (); +while (<META_IN>) { + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split; + $id2spkr{$vox_id} = $spkr_id; +} + +my %test_spkrs = (); +while (<TRIAL_IN>) { + chomp; + my ($tar_or_non, $path1, $path2) = split; + + # Create entry for left-hand side of trial + my ($spkr_id, $filename) = split('/', $path1); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id1 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + # Create entry for right-hand side of trial + ($spkr_id, $filename) = split('/', $path2); + $rec_id = substr($filename, 0, 11); + $segment = substr($filename, 12, 7); + my $utt_id2 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; +} + +foreach (@spkr_dirs) { + my $spkr_id = $_; + my $new_spkr_id = $spkr_id; + # If we're using a newer version of VoxCeleb1, we need to "deanonymize" + # the speaker labels. 
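+ # (e.g. a speaker directory named 'id10001' in newer distributions is mapped back + # to the original speaker name listed in vox1_meta.csv; old-style name directories + # are left unchanged.)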
+ if (exists $id2spkr{$spkr_id}) { + $new_spkr_id = $id2spkr{$spkr_id}; + } + opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $filename = $_; + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; + my $utt_id = "$new_spkr_id-$rec_id-$segment"; + if (exists $test_spkrs{$new_spkr_id}) { + print WAV_TEST "$utt_id", " $wav", "\n"; + print SPKR_TEST "$utt_id", " $new_spkr_id", "\n"; + } else { + print WAV_TRAIN "$utt_id", " $wav", "\n"; + print SPKR_TRAIN "$utt_id", " $new_spkr_id", "\n"; + } + } +} + +close(SPKR_TEST) or die; +close(WAV_TEST) or die; +close(SPKR_TRAIN) or die; +close(WAV_TRAIN) or die; +close(TRIAL_OUT) or die; +close(TRIAL_IN) or die; +close(META_IN) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_test_dir/utt2spk >$out_test_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_test_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_test_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_test_dir") != 0) { + die "Error validating directory $out_test_dir"; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_train_dir/utt2spk >$out_train_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_train_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_train_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_train_dir") != 0) { + die "Error validating directory $out_train_dir"; +} diff --git a/egs/libri_css/s5_mono/local/make_voxceleb2.pl b/egs/libri_css/s5_mono/local/make_voxceleb2.pl new file mode 100755 index 00000000000..34c1591eba3 --- /dev/null +++ b/egs/libri_css/s5_mono/local/make_voxceleb2.pl @@ -0,0 +1,70 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# +# Usage: make_voxceleb2.pl /export/voxceleb2 dev data/dev +# +# Note: This script requires ffmpeg to be installed and its location included in $PATH. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb2 dev data/dev\n"; + exit(1); +} + +# Check that ffmpeg is installed. +if (`which ffmpeg` eq "") { + die "Error: this script requires that ffmpeg is installed."; +} + +($data_base, $dataset, $out_dir) = @ARGV; + +if ("$dataset" ne "dev" && "$dataset" ne "test") { + die "dataset parameter must be 'dev' or 'test'!"; +} + +opendir my $dh, "$data_base/$dataset/aac" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/$dataset/aac/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; + +foreach (@spkr_dirs) { + my $spkr_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/aac/$spkr_id/$_" && ! 
/^\.{1,2}$/} readdir($dh); + closedir $dh; + + foreach (@rec_dirs) { + my $rec_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.m4a$/} readdir($dh); + closedir $dh; + + foreach (@files) { + my $name = $_; + my $wav = "ffmpeg -v 8 -i $data_base/$dataset/aac/$spkr_id/$rec_id/$name.m4a -f wav -acodec pcm_s16le -|"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + } + } +} +close(SPKR) or die; +close(WAV) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/libri_css/s5_mono/local/multispeaker_score.sh b/egs/libri_css/s5_mono/local/multispeaker_score.sh new file mode 100755 index 00000000000..676c29f9192 --- /dev/null +++ b/egs/libri_css/s5_mono/local/multispeaker_score.sh @@ -0,0 +1,111 @@ +#!/usr/bin/env bash +# Copyright 2019 Ashish Arora, Yusuke Fujita +# 2020 Desh Raj +# Apache 2.0. +# This script takes a reference and hypothesis text file, and performs +# multispeaker scoring. + +stage=0 +datadir= +get_stats=false # TODO: Implement 'true' (i.e. per utterance alignment of output) +multistream=false # Set to true if input audio was separated (e.g. CSS) + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +# (must be computed after parse_options.sh so that --multistream takes effect) +multistream_opt= +if [ $multistream == "true" ]; then + multistream_opt="--multi-stream" +fi + +if [ $# != 3 ]; then + echo "Usage: $0 <ref-file> <hyp-file> <out-dir>" + echo "e.g.: $0 data/dev/text \ + exp/chain_cleaned/tdnn_1d_sp/decode_dev_diarized/scoring_kaldi/penalty_1.0/10.txt \ + exp/chain_cleaned/tdnn_1d_sp/decode_dev_diarized/scoring_kaldi_multispeaker" + exit 1; +fi + +ref_file=$1 +hyp_file=$2 +out_dir=$3 + +output_dir=$out_dir/per_speaker_output +wer_dir=$out_dir/per_speaker_wer + +if [ $multistream == "true" ]; then + recording_ids=( $(awk '{$1=$1;sub(/_[0-9]*$/, "", $1); print $1}' data/$datadir/wav.scp | sort -u) ) +else + recording_ids=( $(awk '{print $1}' data/$datadir/wav.scp) ) +fi + +for f in $ref_file $hyp_file; do + [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 0 ]; then + # generate per speaker per recording files for reference and hypothesis + mkdir -p $output_dir $wer_dir + local/wer_output_filter < $ref_file > $output_dir/ref_filt.txt + local/wer_output_filter < $hyp_file > $output_dir/hyp_filt.txt + local/get_perspeaker_output.py --affix "ref" $output_dir/ref_filt.txt data/$datadir/utt2spk.bak $output_dir + local/get_perspeaker_output.py --affix "hyp" $multistream_opt $output_dir/hyp_filt.txt data/$datadir/utt2spk $output_dir +fi + +if [ $stage -le 1 ]; then + # Now for each recording, we score all pairs of ref/hyp speaker outputs + for reco_id in "${recording_ids[@]}"; do + # Get list of ref files + reco_ref_files=( $( ls $output_dir/ref* | grep $reco_id ) ) + # Get list of hyp files + reco_hyp_files=( $( ls $output_dir/hyp* | grep $reco_id ) ) + for reco_ref in "${reco_ref_files[@]}"; do + for reco_hyp in "${reco_hyp_files[@]}"; do + ref_spkid=$( basename "$reco_ref" | cut -d'_' -f2 ) + hyp_spkid=$( basename "$reco_hyp" | cut -d'_' -f2 ) + # compute WER with combined texts + compute-wer --text --mode=present ark:$reco_ref ark:$reco_hyp \ + > $wer_dir/wer_${reco_id}_r${ref_spkid}h${hyp_spkid} 2>/dev/null + done + done + done +fi + +if [ $stage -le 2 ]; then + for reco_id in "${recording_ids[@]}"; do + # For each recording, we create a summary file of all permutations + >$wer_dir/summary_$reco_id + reco_wer_files=( $( ls $wer_dir/wer_* | grep $reco_id ) ) + for reco_wer in "${reco_wer_files[@]}"; do + ref_hyp_spkid=$( basename "$reco_wer" | cut -d'_' -f5 ) + cur_wer=$( head -1 $reco_wer ) + printf "$ref_hyp_spkid %s\n" "${cur_wer}" >> $wer_dir/summary_$reco_id + done + + # Now we get best wer for each recording id + cat $wer_dir/summary_$reco_id \ + | local/best_wer_matching.py \ + > $wer_dir/best_wer_$reco_id + + done + rm $wer_dir/best_wer_all 2> /dev/null + awk ' + function basename(file, a, n) { + n = split(file, a, "/") + return a[n] + } + {printf "%s %s\n", basename(FILENAME), $0}' $wer_dir/best_wer_* > $wer_dir/best_wer_all +fi + +# Also compute the average WER stats over all conditions. This will be used +# for LMWT and WIP selection. +if [ $stage -le 3 ]; then + cat $wer_dir/best_wer_all | sed 's/,//g' | awk ' + { + ERR+=$5; WC+=$7; INS+=$8; DEL+=$10; SUB+=$12; + }END{ + WER=ERR*100/WC; + printf("%%WER %.2f [ %d / %d, %d ins, %d del, %d sub ]",WER,ERR,WC,INS,DEL,SUB); + } + ' > $wer_dir/best_wer_average +fi diff --git a/egs/libri_css/s5_mono/local/nnet3/decode.sh b/egs/libri_css/s5_mono/local/nnet3/decode.sh new file mode 100755 index 00000000000..795fec459b9 --- /dev/null +++ b/egs/libri_css/s5_mono/local/nnet3/decode.sh @@ -0,0 +1,163 @@ +#!/usr/bin/env bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script does 2-stage decoding where the first stage is used to get +# reliable frames for i-vector extraction. 
+ +set -e + +# general opts +iter= +stage=0 +nj=30 +affix= # affix for decode directory + +# ivector opts +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=6000 +ivector_scale=0.75 +get_weights_from_ctm=true +weights_file= # use weights from this archive (must be gzip-compressed) +silence_weight=0.00001 # apply this weight to silence frames during i-vector extraction +ivector_dir=exp/nnet3 + +# decode opts +pass2_decode_opts="--min-active 1000" +lattice_beam=8 +extra_left_context=0 # change for (B)LSTM +extra_right_context=0 # change for BLSTM +frames_per_chunk=50 # change for (B)LSTM +acwt=0.1 # important to change this when using chain models +post_decode_acwt=1.0 # important to change this when using chain models +extra_left_context_initial=0 +extra_right_context_final=0 + +graph_affix= + +score_opts="--min-lmwt 6 --max-lmwt 13" + +. ./cmd.sh +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "Usage: $0 [options] <data-dir> <lang-dir> <graph-dir> <model-dir>" + echo " Options:" + echo " --stage (0|1|2) # start the script from part-way through." + echo "e.g.:" + echo "$0 data/dev data/lang exp/tri5a/graph_pp exp/nnet3/tdnn" + exit 1; +fi + +data=$1 # data directory +lang=$2 # data/lang +graph=$3 #exp/tri5a/graph_pp +dir=$4 # exp/nnet3/tdnn + +model_affix=`basename $dir` +ivector_affix=${affix:+_$affix}_chain_${model_affix}${iter:+_iter$iter} +affix=${affix:+_${affix}}${iter:+_iter${iter}} + +if [ $stage -le 1 ]; then + if [ ! -s ${data}_hires/feats.scp ]; then + utils/copy_data_dir.sh $data ${data}_hires + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$train_cmd" ${data}_hires + steps/compute_cmvn_stats.sh ${data}_hires + fi +fi + +data_set=$(basename $data) +if [ $stage -le 2 ]; then + echo "Extracting i-vectors, stage 1" + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + --max-count $max_count \ + ${data}_hires $ivector_dir/extractor \ + $ivector_dir/ivectors_${data_set}${ivector_affix}_stage1; + # float comparisons are hard in bash + if [ `bc <<< "$ivector_scale != 1"` -eq 1 ]; then + ivector_scale_affix=_scale$ivector_scale + else + ivector_scale_affix= + fi + + if [ ! -z "$ivector_scale_affix" ]; then + echo "$0: Scaling iVectors, stage 1" + srcdir=$ivector_dir/ivectors_${data_set}${ivector_affix}_stage1 + outdir=$ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 + mkdir -p $outdir + $train_cmd $outdir/log/scale_ivectors.log \ + copy-matrix --scale=$ivector_scale scp:$srcdir/ivector_online.scp ark:- \| \ + copy-feats --compress=true ark:- ark,scp:$outdir/ivector_online.ark,$outdir/ivector_online.scp; + cp $srcdir/ivector_period $outdir/ivector_period + fi +fi + +decode_dir=$dir/decode${graph_affix}_${data_set}${affix} +# generate the lattices +if [ $stage -le 3 ]; then + echo "Generating lattices, stage 1" + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 \ + --skip-scoring true ${iter:+--iter $iter} \ + $graph ${data}_hires ${decode_dir}_stage1; +fi + +if [ $stage -le 4 ]; then + if $get_weights_from_ctm; then + if [ ! 
-z $weights_file ]; then + echo "$0: Using provided vad weights file $weights_file" + ivector_extractor_weights=$weights_file + else + echo "$0 : Generating vad weights file" + ivector_extractor_weights=${decode_dir}_stage1/weights${affix}.gz + local/extract_vad_weights.sh --silence-weight $silence_weight \ + --cmd "$decode_cmd" ${iter:+--iter $iter} \ + ${data}_hires $lang \ + ${decode_dir}_stage1 $ivector_extractor_weights + fi + else + # get weights from best path decoding + ivector_extractor_weights=${decode_dir}_stage1 + fi +fi + +if [ $stage -le 5 ]; then + echo "Extracting i-vectors, stage 2 with weights from $ivector_extractor_weights" + # this does offline decoding, except we estimate the iVectors per + # speaker, excluding silence (based on alignments from a DNN decoding), with a + # different script. This is just to demonstrate that script. + # the --sub-speaker-frames is optional; if provided, it will divide each speaker + # up into "sub-speakers" of at least that many frames... can be useful if + # acoustic conditions drift over time within the speaker's data. + steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $nj \ + --silence-weight $silence_weight \ + --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ + ${data}_hires $lang $ivector_dir/extractor \ + $ivector_extractor_weights $ivector_dir/ivectors_${data_set}${ivector_affix}; +fi + +if [ $stage -le 6 ]; then + echo "Generating lattices, stage 2 with --acwt $acwt" + rm -f ${decode_dir}/.error + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" $pass2_decode_opts \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --skip-scoring false ${iter:+--iter $iter} --lattice-beam $lattice_beam \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix} \ + $graph ${data}_hires ${decode_dir} || touch ${decode_dir}/.error + [ -f ${decode_dir}/.error ] && echo "$0: Error decoding" && exit 1; +fi +exit 0 diff --git a/egs/libri_css/s5_mono/local/nnet3/run_ivector_common.sh b/egs/libri_css/s5_mono/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..26653ccbd5c --- /dev/null +++ b/egs/libri_css/s5_mono/local/nnet3/run_ivector_common.sh @@ -0,0 +1,149 @@ +#!/usr/bin/env bash + +set -e -o pipefail + + +# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually +# be called by more scripts). It contains the common feature preparation and iVector-related parts +# of the script. See those scripts for examples of usage. + + +stage=0 +train_set=train_960_cleaned # you might set this to e.g. train_960 +gmm=tri6b_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. +num_threads_ubm=16 +num_processes=4 +nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it + # becomes exp/nnet3_cleaned or whatever. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + #Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment. _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + echo "$0: fixing input data-dir to remove nonexistent features, in case some " + echo ".. speed-perturbed segments were too short." + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + if [ -f $ali_dir/ali.1.gz ]; then + echo "$0: alignments in $ali_dir appear to already exist. Please either remove them " + echo " ... or use a later --stage option." + exit 1 + fi + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/mfcc/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp; do + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires + done + + # now create a data subset. 60k is 1/5th of the training dataset (around 200 hours). + utils/subset_data_dir.sh data/${train_set}_sp_hires 60000 data/${train_set}_sp_hires_60k +fi + + +if [ $stage -le 4 ]; then + echo "$0: making a subset of data to train the diagonal UBM and the PCA transform." + # We'll one hundredth of the data, since Librispeech is very large. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l " + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. 
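+# ($dir is resolved to an absolute path (featdir) below so that the generated .scp +# entries and storage symlinks stay valid regardless of the working directory.)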
+mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_cmvn_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. + utils/create_data_link.pl $featdir/xvector_cmvn_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp +for f in $data_in/segments $data_in/vad.scp ; do + [ -f $f ] && cp $f $data_out/`basename $f`; +done + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_cmvn_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_cmvn_feats_${name}.JOB.ark,$featdir/xvector_cmvn_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_cmvn_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/libri_css/s5_mono/local/nnet3/xvector/prepare_feats_for_egs.sh b/egs/libri_css/s5_mono/local/nnet3/xvector/prepare_feats_for_egs.sh new file mode 100755 index 00000000000..326b6dbb9fa --- /dev/null +++ b/egs/libri_css/s5_mono/local/nnet3/xvector/prepare_feats_for_egs.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# +# Apache 2.0. + +# This script applies sliding window CMVN and removes silence frames. This +# is performed on the raw features prior to generating examples for training +# the x-vector system. Once the training examples are generated, the features +# created by this script can be removed. + +nj=40 +cmd="run.pl" +stage=0 +norm_vars=false +center=true +compress=true +cmn_window=300 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 <in-data-dir> <out-data-dir> <feats-dir>" + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj <nj> # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." + echo " --norm-vars <true|false> # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/vad.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. 
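+ # (Concretely: when storage/ has been set up by create_split_dir.pl above, this + # pre-creates each .ark as a symlink into one of the distributed storage + # directories, spreading the archives across file systems.)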
+ utils/create_data_link.pl $featdir/xvector_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/libri_css/s5_mono/local/nnet3/xvector/run_xvector.sh b/egs/libri_css/s5_mono/local/nnet3/xvector/run_xvector.sh new file mode 120000 index 00000000000..585b63fd2dd --- /dev/null +++ b/egs/libri_css/s5_mono/local/nnet3/xvector/run_xvector.sh @@ -0,0 +1 @@ +tuning/run_xvector_1a.sh \ No newline at end of file diff --git a/egs/libri_css/s5_mono/local/nnet3/xvector/tuning/run_xvector_1a.sh b/egs/libri_css/s5_mono/local/nnet3/xvector/tuning/run_xvector_1a.sh new file mode 100755 index 00000000000..2189e406a7e --- /dev/null +++ b/egs/libri_css/s5_mono/local/nnet3/xvector/tuning/run_xvector_1a.sh @@ -0,0 +1,149 @@ +#!/usr/bin/env bash +# Copyright 2018 David Snyder +# 2018 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2018 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This script trains the x-vector DNN. The recipe is similar to the one +# described in "Diarization is Hard: Some Experiences and Lessons Learned +# for the JHU Team in the Inaugural DIHARD Challenge" by Sell et al. + +. ./cmd.sh +set -e + +stage=1 +train_stage=-1 +use_gpu=true +remove_egs=false + +data=data/train +nnet_dir=exp/xvector_nnet_1a/ +egs_dir=exp/xvector_nnet_1a/egs + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l) + +# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh. +# The argument --num-repeats is related to the number of times a speaker +# repeats per archive. If it seems like you're getting too many archives +# (e.g., more than 200) try increasing the --frames-per-iter option. The +# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the +# minimum and maximum length (in terms of number of frames) of the features +# in the examples. +# +# To make sense of the egs script, it may be necessary to put an "exit 1" +# command immediately after stage 3. Then, inspect +# exp/<dir>/egs/temp/ranges.* . The ranges files specify the examples that +# will be created, and which archives they will be stored in. Each line of +# ranges.* has the following form: +# <utt-id> <local-ark-indx> <global-ark-indx> <start-frame> <end-frame> <spk-id> +# For example: +# 100304-f-sre2006-kacg-A 1 2 4079 881 23 + +# If you're satisfied with the number of archives (e.g., 50-150 archives is +# reasonable) and with the number of examples per speaker (e.g., 1000-5000 +# is reasonable) then you can let the script continue to the later stages. 
+# Otherwise, try increasing or decreasing the --num-repeats option. You might +# need to fiddle with --frames-per-iter. Increasing this value decreases the +# number of archives and increases the number of examples per archive. +# Decreasing this value increases the number of archives, while decreasing the +# number of examples per archive. +if [ $stage -le 6 ]; then + echo "$0: Getting neural network training egs"; + # dump egs. + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b{03,04,05,06}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage + fi + sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \ + --nj 8 \ + --stage 0 \ + --frames-per-iter 1000000000 \ + --frames-per-iter-diagnostic 500000 \ + --min-frames-per-chunk 200 \ + --max-frames-per-chunk 400 \ + --num-diagnostic-archives 3 \ + --num-repeats 40 \ + "$data" $egs_dir +fi + +if [ $stage -le 7 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}') + feat_dim=$(cat $egs_dir/info/feat_dim) + + # This chunk-size corresponds to the maximum number of frames the + # stats layer is able to pool over. In this script, it corresponds + # to 4 seconds. If the input recording is greater than 4 seconds, + # we will compute multiple xvectors from the same recording and average + # to produce the final xvector. + max_chunk_size=400 + + # The smallest number of frames we're comfortable computing an xvector from. + # Note that the hard minimum is given by the left and right context of the + # frame-level layers. + min_chunk_size=20 + mkdir -p $nnet_dir/configs + cat <<EOF > $nnet_dir/configs/network.xconfig + # please note that it is important to have input layer with the name=input + + # The frame-level layers + input dim=${feat_dim} name=input + relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512 + relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512 + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn4 dim=512 + relu-batchnorm-layer name=tdnn5 dim=1500 + + # The stats pooling layer. Layers after this are segment-level. + # In the config below, the first and last argument (0, and ${max_chunk_size}) + # means that we pool over an input segment starting at frame 0 + # and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1) + # mean that no subsampling is performed. + stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size}) + + # This is where we usually extract the embedding (aka xvector) from. 
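+ # (Here that is tdnn6: the extract.config written below reads from tdnn6.affine, + # giving a 128-dimensional embedding.)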
+ relu-batchnorm-layer name=tdnn6 dim=128 input=stats + output-layer name=output include-log-softmax=true dim=${num_targets} +EOF + + steps/nnet3/xconfig_to_configs.py \ + --xconfig-file $nnet_dir/configs/network.xconfig \ + --config-dir $nnet_dir/configs/ + cp $nnet_dir/configs/final.config $nnet_dir/nnet.config + + # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh + echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config + echo "$max_chunk_size" > $nnet_dir/max_chunk_size + echo "$min_chunk_size" > $nnet_dir/min_chunk_size +fi + +dropout_schedule='0,0@0.20,0.1@0.50,0' +srand=123 +if [ $stage -le 8 ]; then + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.momentum=0.5 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.minibatch-size=64 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2 \ + --trainer.num-epochs=3 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.shuffle-buffer-size=1000 \ + --egs.frames-per-eg=1 \ + --egs.dir="$egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --dir=$nnet_dir || exit 1; +fi + +exit 0; diff --git a/egs/libri_css/s5_mono/local/prepare_data.py b/egs/libri_css/s5_mono/local/prepare_data.py new file mode 100755 index 00000000000..3d5b622ab30 --- /dev/null +++ b/egs/libri_css/s5_mono/local/prepare_data.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# Apache 2.0 + +import argparse, os, glob, tqdm, zipfile +import subprocess + +def write_dict_to_file(utt2data, file_path): + f = open(file_path, 'w') + for utt in utt2data.keys(): + f.write('{} {}\n'.format(utt, utt2data[utt])) + f.close() + return + +def main(args): + os.makedirs(args.tgtpath, exist_ok=True) + + # Dictionary to store all info that we will write to files after + # reading all files. + reco2wav = {} # for wav.scp + reco2segments = {} # for segments + utt2spk = {} # for utt2spk + utt2text = {} # for text + print ("Creating dictionary of all clean LibriSpeech utterances") + if (args.cleanpath): + utt2clean = {} # path to clean utt wav file + command = 'find %s -name "*.flac"' % (args.cleanpath) + wavs = subprocess.check_output(command, shell=True).decode('utf-8').splitlines() + keys = [ os.path.splitext(os.path.basename(wav))[0] for wav in wavs ] + clean_paths = {key:wav for key,wav in zip(keys,wavs)} + + # Create a directory to store channel-separated wav files + wav_dir = os.path.join(args.tgtpath,'wavs') + os.makedirs(wav_dir, exist_ok=True) + + conditions = ('0L','0S','OV10','OV20','OV30','OV40') + for cond in tqdm.tqdm(conditions): + meeting = glob.glob(os.path.join(args.srcpath, cond, 'overlap*')) + for meet in meeting: + # Extract the signals of the selected microphones. 
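+ # (The meeting directory name encodes the mixing condition, e.g. an overlap + # ratio and silence range; its last two '_'-separated fields give the session + # id and overlap ratio parsed below.)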
+ meeting_name = os.path.basename(meet) + _,_,_,_,_,sessid,olr = meeting_name.split('_') + + wav_path = os.path.join(os.path.abspath(meet), 'record', 'raw_recording.wav') + for mic in args.mics: + reco_id = "{}_CH{}_{}".format(sessid, mic, cond) # Session0_CH1_0L + new_wav_path = "sox {} -t wav - remix {} |".format(wav_path, mic+1) # channel will be extracted on the fly + reco2wav[reco_id] = new_wav_path + + segments = [] + with open(os.path.join(os.path.abspath(meet), 'transcription', 'meeting_info.txt'), 'r') as f: + next(f) + for line in f: + start,end,spkid,clean_uttid,text = line.strip().split(maxsplit=4) + start = float("{:.2f}".format(float(start))) + end = float("{:.2f}".format(float(end))) + utt_id = "{}_{}_{}_{}".format(spkid,reco_id,"{:.0f}".format(100*start).zfill(6), + "{:.0f}".format(100*end).zfill(6)) # 6930_Session0_CH1_0L_000853_002463 + utt2spk[utt_id] = spkid + utt2text[utt_id] = text + segments.append((utt_id, start, end)) + if args.cleanpath: + utt2clean[utt_id] = "sox {} -t wav - |".format(clean_paths[clean_uttid]) + + reco2segments[reco_id] = segments + + # Write all dictionaries to respective files + write_dict_to_file(reco2wav, os.path.join(args.tgtpath, 'wav.scp')) + write_dict_to_file(utt2spk, os.path.join(args.tgtpath, 'utt2spk')) + write_dict_to_file(utt2text, os.path.join(args.tgtpath, 'text')) + if args.cleanpath: + # wav_clean.scp can only be produced when --cleanpath was given + write_dict_to_file(utt2clean, os.path.join(args.tgtpath, "wav_clean.scp")) + + f = open(os.path.join(args.tgtpath, 'segments'), 'w') + for reco in reco2segments.keys(): + segments = reco2segments[reco] + for segment in segments: + f.write('{} {} {} {}\n'.format(segment[0], reco, segment[1], segment[2])) + f.close() + + + +def make_argparse(): + parser = argparse.ArgumentParser(description='Reorganize LibriCSS data into Kaldi format.') + parser.add_argument('--srcpath', metavar='', required=True, + help='Original LibriCSS data path.') + parser.add_argument('--tgtpath', metavar='', required=True, + help='Destination path.') + parser.add_argument('--mics', type=int, metavar='<#mics>', nargs='+', default=[0, 1, 2, 3, 4, 5, 6], + help='Microphone indices.') + parser.add_argument('--cleanpath', metavar='', required=False, + help='Path to clean Librispeech data (required for wav_clean.scp)') + + return parser + + + +if __name__ == '__main__': + parser = make_argparse() + args = parser.parse_args() + main(args) diff --git a/egs/libri_css/s5_mono/local/prepare_data_css.py b/egs/libri_css/s5_mono/local/prepare_data_css.py new file mode 100755 index 00000000000..7a87e80e30d --- /dev/null +++ b/egs/libri_css/s5_mono/local/prepare_data_css.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# Apache 2.0 + +import argparse, os, glob, tqdm, pathlib + +def write_dict_to_file(utt2data, file_path): + f = open(file_path, 'w') + for utt in utt2data.keys(): + f.write('{} {}\n'.format(utt, utt2data[utt])) + f.close() + return + +def main(args): + os.makedirs(args.tgtpath, exist_ok=True) + + # Dictionary to store all info that we will write to files after + # reading all files. 
+ reco2wav = {} # for wav.scp + reco2segments = {} # for segments + utt2spk = {} # for utt2spk + utt2text = {} # for text + + # First we create reco2wav from the separated wav files + wavs = os.listdir(args.wav_path) + for wav in wavs: + path = os.path.join(args.wav_path, wav) + _,_,olr,_,sil_max,sessid,_,_,stream = pathlib.Path(path).stem.split('_') + cond = "OV{}".format(int(float(olr))) + if (float(olr) == 0): + if (sil_max == '0.5'): + cond = "0S" + else: + cond = "0L" + wav_name = "{}_CH0_{}_{}".format(sessid, cond, stream) # session0_CH0_0L_1 + reco2wav[wav_name] = path + if (args.volume != 1): + reco2wav[wav_name] = "sox -v {} -t wav {} -t wav - |".format(args.volume, path) + + + # Now we get other info from the original LibriCSS corpus dir + conditions = ('0L','0S','OV10','OV20','OV30','OV40') + for cond in tqdm.tqdm(conditions): + meeting = glob.glob(os.path.join(args.srcpath, cond, 'overlap*')) + for meet in meeting: + segments = [] + _,_,_,_,_,sessid,_ = os.path.basename(meet).split('_') + reco_id = "{}_CH0_{}".format(sessid, cond) # session0_CH0_0L + with open(os.path.join(os.path.abspath(meet), 'transcription', 'meeting_info.txt'), 'r') as f: + next(f) + for line in f: + start,end,spkid,_,text = line.strip().split(maxsplit=4) + start = float("{:.2f}".format(float(start))) + end = float("{:.2f}".format(float(end))) + utt_id = "{}_{}_{}_{}".format(spkid,reco_id,"{:.0f}".format(100*start).zfill(6), + "{:.0f}".format(100*end).zfill(6)) # 6930_Session0_CH1_0L_000853_002463 + utt2spk[utt_id] = spkid + utt2text[utt_id] = text + segments.append((utt_id, start, end)) + + reco2segments[reco_id] = segments + + # Write all dictionaries to respective files + write_dict_to_file(reco2wav, os.path.join(args.tgtpath, 'wav.scp')) + write_dict_to_file(utt2spk, os.path.join(args.tgtpath, 'utt2spk')) + write_dict_to_file(utt2text, os.path.join(args.tgtpath, 'text')) + + f = open(os.path.join(args.tgtpath, 'segments'), 'w') + for reco in reco2segments.keys(): + segments = reco2segments[reco] + for segment in segments: + f.write('{} {} {} {}\n'.format(segment[0], reco, segment[1], segment[2])) + f.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Reorganize LibriCSS data into Kaldi format.' + ' Additionally, use separated wav files.') + parser.add_argument('--srcpath', metavar='', required=True, + help='Original LibriCSS data path.') + parser.add_argument('--wav-path', metavar='', required=True, + help='Path to directory containing separated wavs.') + parser.add_argument('--tgtpath', metavar='', required=True, + help='Destination path.') + parser.add_argument('--volume', default=1, type=float, help='sox -v option') + + args = parser.parse_args() + main(args) diff --git a/egs/libri_css/s5_mono/local/prepare_dict.sh b/egs/libri_css/s5_mono/local/prepare_dict.sh new file mode 100755 index 00000000000..7b345b6bf1c --- /dev/null +++ b/egs/libri_css/s5_mono/local/prepare_dict.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash + +# Copyright 2014 Vassil Panayotov +# Apache 2.0 + +# Prepares the dictionary and auto-generates the pronunciations for the words +# that are in our vocabulary but not in CMUdict + +stage=0 +nj=4 # number of parallel Sequitur G2P jobs to use +cmd=run.pl + + +. utils/parse_options.sh || exit 1; +. 
./path.sh || exit 1 + + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] <lm-dir> <g2p-model-dir> <dst-dir>" + echo "e.g.: $0 /export/a15/vpanayotov/data/lm /export/a15/vpanayotov/data/g2p data/local/dict" + echo "Options:" + echo " --cmd '<command>' # script to launch jobs with, default: run.pl" + echo " --nj <nj> # number of jobs to run, default: 4." + exit 1 +fi + +lm_dir=$1 +g2p_model_dir=$2 +dst_dir=$3 + +vocab=$lm_dir/librispeech-vocab.txt +[ ! -f $vocab ] && echo "$0: vocabulary file not found at $vocab" && exit 1; + +# this file is either a copy of the lexicon we download from openslr.org/11 or is +# created by the G2P steps below +lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt + +cmudict_dir=$dst_dir/cmudict +cmudict_plain=$dst_dir/cmudict.0.7a.plain + +mkdir -p $dst_dir || exit 1; + +if [ $stage -le 0 ]; then + echo "Downloading and preparing CMUdict" + if [ ! -s $cmudict_dir/cmudict.0.7a ]; then + svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $cmudict_dir || exit 1; + fi + echo "Removing the pronunciation variant markers ..." + grep -v ';;;' $cmudict_dir/cmudict.0.7a | \ + perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \ + > $cmudict_plain || exit 1; +fi + + +if [ $stage -le 1 ]; then + # check that Sequitur G2P is installed + if [ ! -f "$sequitur" ]; then + if ! which swig >&/dev/null; then + echo "Please install 'swig' and then run $KALDI_ROOT/tools/extras/install_sequitur.sh" + exit 1 + else + echo "Sequitur G2P not found - running $KALDI_ROOT/tools/extras/install_sequitur.sh" + pushd $KALDI_ROOT/tools + extras/install_sequitur.sh || exit 1 + popd + fi + fi + [[ -f "$sequitur" ]] || { echo "Still can't find Sequitur G2P - check your path.sh"; exit 1; } + + g2p_dir=$dst_dir/g2p + auto_vocab_prefix="$g2p_dir/vocab_autogen" + auto_lexicon_prefix="$g2p_dir/lexicon_autogen" + + mkdir -p $g2p_dir/log + auto_vocab_splits=$(eval "echo $auto_vocab_prefix.{$(seq -s',' $nj | sed 's/,$//')}") + awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $cmudict_plain $vocab |\ + sort | tee $g2p_dir/vocab_autogen.full |\ + utils/split_scp.pl /dev/stdin $auto_vocab_splits || exit 1 + echo "Autogenerating pronunciations for the words in $auto_vocab_prefix.* ..." + $cmd JOB=1:$nj $g2p_dir/log/g2p.JOB.log \ + local/g2p.sh $auto_vocab_prefix.JOB $g2p_model_dir $auto_lexicon_prefix.JOB || exit 1 + g2p_vocab_size=$(wc -l <$g2p_dir/vocab_autogen.full) + g2p_lex_size=$(wc -l < <(cat $auto_lexicon_prefix.*)) + [[ "$g2p_vocab_size" -eq "$g2p_lex_size" ]] || { echo "Unexpected G2P error"; exit 1; } + sort <(cat $auto_vocab_prefix.*) >$dst_dir/vocab_autogen.txt + sort <(cat $auto_lexicon_prefix.*) >$dst_dir/lexicon_autogen.txt + echo "$(wc -l <$g2p_dir/vocab_autogen.full) pronunciations autogenerated OK" +fi + +if [ $stage -le 2 ]; then + echo "Combining the CMUdict pronunciations with the autogenerated ones ..." + awk 'NR==FNR{a[$1]=1; next} ($1 in a)' $vocab $cmudict_plain |\ + cat - $dst_dir/lexicon_autogen.txt | sort >$lexicon_raw_nosil || exit 1 + raw_lex_size=$(cat $lexicon_raw_nosil | awk '{print $1}' | sort -u | wc -l) + vocab_size=$(wc -l <$vocab) + [[ "$vocab_size" -eq "$raw_lex_size" ]] || { + echo "Inconsistent lexicon($raw_lex_size) vs vocabulary($vocab_size) size!"; + exit 1; } + echo "Combined lexicon saved to '$lexicon_raw_nosil'" +fi + +# The copy operation below is necessary if we skip the G2P stages (e.g. using --stage 3) +if [[ ! 
-s "$lexicon_raw_nosil" ]]; then + cp $lm_dir/librispeech-lexicon.txt $lexicon_raw_nosil || exit 1 +fi + +if [ $stage -le 3 ]; then + silence_phones=$dst_dir/silence_phones.txt + optional_silence=$dst_dir/optional_silence.txt + nonsil_phones=$dst_dir/nonsilence_phones.txt + extra_questions=$dst_dir/extra_questions.txt + + echo "Preparing phone lists and clustering questions" + (echo SIL; echo SPN;) > $silence_phones + echo SIL > $optional_silence + # nonsilence phones; on each line is a list of phones that correspond + # really to the same base phone. + awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $lexicon_raw_nosil |\ + sort -u |\ + perl -e 'while(<>){ + chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; + $phones_of{$1} .= "$_ "; } + foreach $list (values %phones_of) {print $list . "\n"; } ' | sort \ + > $nonsil_phones || exit 1; + # A few extra questions that will be added to those obtained by automatically clustering + # the "real" phones. These ask about stress; there's also one for silence. + cat $silence_phones| awk '{printf("%s ", $1);} END{printf "\n";}' > $extra_questions || exit 1; + cat $nonsil_phones | perl -e 'while(<>){ foreach $p (split(" ", $_)) { + $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + >> $extra_questions || exit 1; + echo "$(wc -l <$silence_phones) silence phones saved to: $silence_phones" + echo "$(wc -l <$optional_silence) optional silence saved to: $optional_silence" + echo "$(wc -l <$nonsil_phones) non-silence phones saved to: $nonsil_phones" + echo "$(wc -l <$extra_questions) extra triphone clustering-related questions saved to: $extra_questions" +fi + +if [ $stage -le 4 ]; then + (echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; ) |\ + cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt + echo "Lexicon text file saved as: $dst_dir/lexicon.txt" +fi + +exit 0 diff --git a/egs/libri_css/s5_mono/local/rnnlm/train.sh b/egs/libri_css/s5_mono/local/rnnlm/train.sh new file mode 120000 index 00000000000..8e647598556 --- /dev/null +++ b/egs/libri_css/s5_mono/local/rnnlm/train.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a.sh \ No newline at end of file diff --git a/egs/libri_css/s5_mono/local/rnnlm/tuning/run_tdnn_lstm_1a.sh b/egs/libri_css/s5_mono/local/rnnlm/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..0fcf4c354b1 --- /dev/null +++ b/egs/libri_css/s5_mono/local/rnnlm/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# 2018 Ke Li + +# This script trains LMs on the librispeech-lm-norm.txt.gz. + +# rnnlm/train_rnnlm.sh: best iteration (out of 143) was 142, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 109.2 / 110.7. 
+# Train objf: -5.74 -5.54 -5.44 -5.37 -5.32 -5.28 -5.25 -5.23 -5.20 -5.18 -5.15 -5.14 -5.12 -5.10 -5.09 -5.08 -5.07 -5.05 -5.04 -5.04 -5.03 -5.02 -5.01 -5.00 -4.99 -4.99 -4.98 -4.97 -4.96 -4.96 -4.95 -4.95 -4.94 -4.93 -4.93 -4.92 -4.92 -4.92 -4.91 -4.90 -4.90 -4.89 -4.89 -4.89 -4.88 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.86 -4.85 -4.85 -4.84 -4.84 -4.84 -4.84 -4.84 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.82 -4.81 -4.81 -4.81 -4.81 -4.80 -4.80 -4.80 -4.79 -4.79 -4.79 -4.79 -4.78 -4.79 -4.78 -4.78 -4.78 -4.78 -4.77 -4.77 -4.77 -4.77 -4.77 -4.76 -4.76 -4.76 -4.76 -4.76 -4.75 -4.75 -4.75 -4.75 -4.75 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.73 -4.74 -4.74 -4.73 -4.73 -4.73 -4.73 -4.73 -4.72 -4.73 -4.73 -4.73 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.71 -4.71 -4.71 -4.71 -4.71 -4.70 -4.70 -4.70 -4.70 -4.70 -4.69 -4.69 -4.69 -4.69 -4.69 -4.69 -4.68 -4.68 +# Dev objf: -5.99 -5.65 -5.53 -5.44 -5.38 -5.34 -5.30 -5.27 -5.22 -5.20 -5.18 -5.16 -5.14 -5.12 -5.11 -5.10 -5.09 -5.08 -5.07 -5.05 -5.04 -5.04 -5.03 -5.01 -5.00 -4.99 -4.99 -4.98 -4.97 -4.97 0.00 -4.96 -4.95 -4.95 -4.94 -4.93 -4.93 -4.92 -4.92 -4.91 -4.91 -4.90 -4.90 -4.89 -4.89 -4.89 -4.88 -4.88 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.85 -4.85 -4.87 -4.84 -4.84 -4.84 -4.83 -4.91 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.82 -4.81 -4.81 -4.81 -4.80 -4.80 -4.80 -4.80 -4.80 -4.79 -4.79 -4.79 -4.79 -4.79 -4.79 -4.78 -4.78 -4.79 -4.78 -4.77 -4.77 -4.77 -4.77 -4.77 -4.77 -4.77 -4.76 -4.76 -4.76 -4.76 -4.76 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.73 -4.74 -4.73 -4.73 -4.73 -4.73 -4.73 -4.73 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 + +# WER summary on dev and test sets +# System tdnn_1d_sp +lattice_rescore +nbest_rescore +# WER on dev(fglarge) 3.34 2.71 2.62 +# WER on dev(tglarge) 3.44 2.75 2.66 +# WER on dev_other(fglarge) 8.70 7.37 7.55 +# WER on dev_other(tglarge) 9.25 7.56 7.73 +# WER on test(fglarge) 3.77 3.12 3.06 +# WER on test(tglarge) 3.85 3.18 3.11 +# WER on test_other(fglarge) 8.91 7.63 7.68 +# WER on test_other(tglarge) 9.31 7.83 7.95 + +# command to get the WERs above: +# tdnn_1d_sp +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}/wer* | best_wer.sh; done; done +# tdnn_1d_sp with lattice rescoring +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}_rnnlm_1a_rescore/wer* | best_wer.sh; done; done +# tdnn_1d_sp with nbest rescoring +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}_rnnlm_1a_nbest_rescore/wer* | best_wer.sh; done; done + +# Begin configuration section. + +dir=exp/rnnlm_lstm_1a +embedding_dim=1024 +lstm_rpd=256 +lstm_nrpd=256 +stage=-10 +train_stage=-10 +epochs=4 + +# variables for lattice rescoring +run_lat_rescore=true +run_nbest_rescore=true +run_backward_rnnlm=false +ac_model_dir=exp/chain_cleaned/tdnn_1d_sp +decode_dir_suffix=rnnlm_1a +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true + +. ./cmd.sh +. 
./utils/parse_options.sh
+
+text=data/local/lm/librispeech-lm-norm.txt.gz
+lexicon=data/lang_nosp/words.txt
+text_dir=data/rnnlm/text
+mkdir -p $dir/config
+set -e
+
+for f in $lexicon; do
+  [ ! -f $f ] && \
+    echo "$0: expected file $f to exist; search for utils/prepare_lang.sh in local/train_asr.sh" && exit 1
+done
+
+if [ $stage -le 0 ]; then
+  mkdir -p $text_dir
+  if [ ! -f $text ]; then
+    wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P data/local/lm
+  fi
+  echo -n >$text_dir/dev.txt
+  # hold out one in every 2000 lines as dev data.
+  gunzip -c $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%2000 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/librispeech.txt
+fi
+
+if [ $stage -le 1 ]; then
+  cp $lexicon $dir/config/
+  n=`cat $dir/config/words.txt | wc -l`
+  echo "<brk> $n" >> $dir/config/words.txt
+
+  # words that are not present in words.txt but are in the training or dev data, will be
+  # mapped to <unk> during training.
+  echo "<unk>" >$dir/config/oov.txt
+
+  cat > $dir/config/data_weights.txt <<EOF
+librispeech   1   1.0
+EOF
+
+  rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
+                             --unk-word="<unk>" \
+                             --data-weights-file=$dir/config/data_weights.txt \
+                             $text_dir >$dir/config/unigram_probs.txt
+
+  # choose features
+  rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
+                           --top-word-features=5000 \
+                           --use-constant-feature=true \
+                           --special-words='<s>,</s>,<brk>,<unk>,<SPOKEN_NOISE>' \
+                           $dir/config/words.txt > $dir/config/features.txt
+
+  cat >$dir/config/xconfig <<EOF
+input dim=$embedding_dim name=input
+relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
+fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
+relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
+fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
+relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
+output-layer name=output include-log-softmax=false dim=$embedding_dim
+EOF
+  rnnlm/validate_config_dir.sh $text_dir $dir/config
+fi
+
+  echo "  --dev_decodedir <dir>                # dev set decoding directory"
+  echo "  --eval_decodedir <dir>               # eval set decoding directory"
+  echo "  --dev_datadir <dir>                  # dev set data directory"
+  echo "  --eval_datadir <dir>                 # eval set data directory"
+  echo "  --min_lmwt <int>                     # minimum LM-weight for lattice rescoring"
+  echo "  --max_lmwt <int>                     # maximum LM-weight for lattice rescoring"
+  echo "  --multistream <true|false>           # set to true if scoring multistream audio"
+
+  exit 1;
+fi
+
+mkdir -p $dev_decodedir/scoring_kaldi_multispeaker
+
+if [ $stage -le 1 ]; then
+  # obtaining multi speaker WER for all lmwt and wip
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    $cmd LMWT=$min_lmwt:$max_lmwt \
+      $dev_decodedir/scoring_kaldi_multispeaker/multispeaker_score.LMWT.log \
+      local/multispeaker_score.sh --multistream $multistream \
+        --datadir $dev_datadir --get_stats false data/$dev_datadir/text \
+        $dev_decodedir/scoring_kaldi/penalty_$wip/LMWT.txt \
+        $dev_decodedir/scoring_kaldi_multispeaker/penalty_$wip/LMWT
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  # obtaining best lmwt, wip and wer
+  echo "Selecting best LM weight and WIP based on the average WER"
+  grep WER $dev_decodedir/scoring_kaldi_multispeaker/penalty_*/*/per_speaker_wer/best_wer_average | \
+    utils/best_wer.sh >& $dev_decodedir/scoring_kaldi_multispeaker/best_wer_average
+
+  best_wer_file=$(awk '{print $NF}' $dev_decodedir/scoring_kaldi_multispeaker/best_wer_average)
+  best_lmwt=$(echo $best_wer_file | cut -d'/' -f7)
+  best_wip=$(echo $best_wer_file | cut -d'/' -f6 | cut -d'_' -f2)
+
+  # printing and storing best lmwt and wip
+  echo "best LM weight: $best_lmwt"
+  echo "best insertion penalty weight: $best_wip"
+
+  echo $best_lmwt > $dev_decodedir/scoring_kaldi_multispeaker/lmwt
+  echo $best_wip > $dev_decodedir/scoring_kaldi_multispeaker/wip
+fi
+
+if [ $stage -le 3 ]; then
+  # Get WER for all conditions for the selected LMWT and WIP and remove other files
+  best_lmwt="$(cat $dev_decodedir/scoring_kaldi_multispeaker/lmwt)"
+  best_wip="$(cat $dev_decodedir/scoring_kaldi_multispeaker/wip)"
+  cat $dev_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/per_speaker_wer/best_wer_all \
+    > $dev_decodedir/scoring_kaldi_multispeaker/best_wer
+  echo "Cleaning up WER files..."
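+  # Each line of best_wer_all (produced via local/multispeaker_score.sh) is
+  # expected to look like:
+  #   <condition> %WER 12.34 [ 1234 / 10000, 100 ins, 200 del, 934 sub ]
+  # so in the awk commands below, $5 is the total error count, $7 the reference
+  # word count, and $8/$10/$12 the insertion/deletion/substitution counts.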
+ find $dev_decodedir/scoring_kaldi_multispeaker/penalty_*/*/per_speaker_wer -maxdepth 1 -name "wer_*" -delete + + # Compute overall WER average + cat $dev_decodedir/scoring_kaldi_multispeaker/best_wer | awk ' + { + ERR+=$5; WC+=$7; INS+=$8; DEL+=$10; SUB+=$12; + }END{ + WER=ERR*100/WC; + printf("%%WER %.2f [ %d / %d, %d ins, %d del, %d sub ]",WER,ERR,WC,INS,DEL,SUB); + } + ' > $dev_decodedir/scoring_kaldi_multispeaker/best_wer_average +fi + +# Now scoring the eval set using best LMWT and WIP + +if [ $stage -le 4 ]; then + # obtaining per recording stats for eval + best_lmwt="$(cat $dev_decodedir/scoring_kaldi_multispeaker/lmwt)" + best_wip="$(cat $dev_decodedir/scoring_kaldi_multispeaker/wip)" + local/multispeaker_score.sh \ + --multistream $multistream \ + --datadir $eval_datadir data/$eval_datadir/text \ + $eval_decodedir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + $eval_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/ +fi + +if [ $stage -le 5 ]; then + # obtaining eval wer corresponding to best lmwt, best_array and wip of dev + best_lmwt="$(cat $dev_decodedir/scoring_kaldi_multispeaker/lmwt)" + best_wip="$(cat $dev_decodedir/scoring_kaldi_multispeaker/wip)" + + find $eval_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/per_speaker_wer -maxdepth 1 -name "wer_*" -delete + + # Compute the average WER stats for all conditions individually. + wer_dir=$eval_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/per_speaker_wer + for cond in $conditions; do + grep $cond $wer_dir/best_wer_all | awk -v COND="$cond" ' + { + ERR+=$5; WC+=$7; INS+=$8; DEL+=$10; SUB+=$12; + }END{ + WER=ERR*100/WC; + printf("%s %%WER %.2f [ %d / %d, %d ins, %d del, %d sub ]\n",COND,WER,ERR,WC,INS,DEL,SUB); + } + ' + done > $eval_decodedir/scoring_kaldi_multispeaker/best_wer + + # Compute overall WER average + cat $wer_dir/best_wer_all | awk ' + { + ERR+=$5; WC+=$7; INS+=$8; DEL+=$10; SUB+=$12; + }END{ + WER=ERR*100/WC; + printf("%%WER %.2f [ %d / %d, %d ins, %d del, %d sub ]",WER,ERR,WC,INS,DEL,SUB); + } + ' > $eval_decodedir/scoring_kaldi_multispeaker/best_wer_average +fi + +# printing dev and eval wer +echo "Dev WERs:" +cat $dev_decodedir/scoring_kaldi_multispeaker/best_wer +echo "Eval WERs:" +cat $eval_decodedir/scoring_kaldi_multispeaker/best_wer + diff --git a/egs/libri_css/s5_mono/local/score_reco_oracle.sh b/egs/libri_css/s5_mono/local/score_reco_oracle.sh new file mode 100755 index 00000000000..e3dc1369a46 --- /dev/null +++ b/egs/libri_css/s5_mono/local/score_reco_oracle.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Copyright 2019 Johns Hopkins University (Author: Shinji Watanabe) +# Apache 2.0 +# +# This script scores the multi-speaker LibriCSS recordings. +# It first calculates the best search parameter configurations by using the dev set +# and then uses these to score both sets. + +cmd=run.pl +dev=exp/chain_cleaned/tdnn_1d_sp/decode_dev +eval=exp/chain_cleaned/tdnn_1d_sp/decode_eval + +conditions="0L 0S OV10 OV20 OV30 OV40" + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 0 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)]" + echo "This script scores the LibriCSS full recordings" + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 
+ echo " --dev # dev set decoding directory" + echo " --eval # eval set decoding directory" + exit 1; +fi + +# get language model weight and word insertion penalty from the dev set +best_lmwt=`cat $dev/scoring_kaldi/wer_details/lmwt` +best_wip=`cat $dev/scoring_kaldi/wer_details/wip` + +echo "best LM weight: $best_lmwt" +echo "insertion penalty weight: $best_wip" + +echo "==== development set ====" +# development set +# we report scores by overlap type, i.e., 0L, 0S, OV10, and so on. + +# get the scores per utterance +score_result=$dev/scoring_kaldi/wer_details/per_utt + +for cond in $conditions; do + # get nerror + nerr=`grep "\#csid" $score_result | grep $cond | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $cond | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + # report the results + echo -n "Condition $cond: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" +done + +echo -n "overall: " +# get nerror +nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` +# get nwords from references (NF-2 means to exclude utterance id and " ref ") +nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` +# compute wer with scale=2 +wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` +echo -n "#words $nwrd, " +echo -n "#errors $nerr, " +echo "wer $wer %" + +echo "==== evaluation set ====" +# evaluation set +# get the scoring result per utterance. Copied from local/score.sh +mkdir -p $eval/scoring_kaldi/wer_details_devbest +$cmd $eval/scoring_kaldi/log/stats1.log \ + cat $eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$eval/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \> $eval/scoring_kaldi/wer_details_devbest/per_utt + +score_result=$eval/scoring_kaldi/wer_details_devbest/per_utt + +for cond in $conditions; do + # get nerror + nerr=`grep "\#csid" $score_result | grep $cond | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $cond | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "Condition $cond: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" +done + +echo -n "overall: " +# get nerror +nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` +# get nwords from references (NF-2 means to exclude utterance id and " ref ") +nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` +# compute wer with scale=2 +wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` +echo -n "overall: " +echo -n "#words $nwrd, " +echo -n "#errors $nerr, " +echo "wer $wer %" + + diff --git a/egs/libri_css/s5_mono/local/segmentation/apply_webrtcvad.py b/egs/libri_css/s5_mono/local/segmentation/apply_webrtcvad.py new file mode 100755 index 00000000000..2fec7e575e6 --- /dev/null +++ b/egs/libri_css/s5_mono/local/segmentation/apply_webrtcvad.py @@ -0,0 +1,212 @@ +#!/usr/bin/python3 +# +# This script gets speech segments from whole recordings using webrtcvad +# Modified from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py +# +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# 
+# Apache 2.0
+
+import collections, sys, os, argparse, contextlib
+import wave
+import webrtcvad
+
+def get_args():
+    parser = argparse.ArgumentParser(description="Obtain speech segments for all wav files in a dir."
+                        " Writes the output to stdout."
+                        " Usage: apply_webrtcvad.py [options...] <data-dir>"
+                        " E.g.: apply_webrtcvad.py --mode 2 --reco2channels data/reco2channels data",
+                        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument("--mode", type=int, dest="mode", default=1,
+                        help="Integer in {0,1,2,3} specifying the VAD aggressiveness. 0 is the least aggressive"
+                        " about filtering out non-speech, 3 is the most aggressive.")
+
+    parser.add_argument("--reco2channels", type=str, dest="reco2ch_file",
+                        help="In a multi-channel setting, specifying this avoids computing VAD for each channel"
+                        " separately. Only the first channel is used to compute VAD and all channels share it.")
+
+    parser.add_argument("data_dir", help="Data directory containing wav.scp")
+
+    args = parser.parse_args()
+
+    return args
+
+def check_args(args):
+    if args.mode not in [0, 1, 2, 3]:
+        raise Exception("Aggressiveness mode must be in {0,1,2,3}")
+    if not os.path.exists(os.path.join(args.data_dir, 'wav.scp')):
+        raise Exception("No wav.scp file exists")
+    return
+
+def read_wave(path):
+    """Reads a .wav file.
+    Takes the path, and returns (PCM audio data, sample rate).
+    """
+    with contextlib.closing(wave.open(path, 'rb')) as wf:
+        num_channels = wf.getnchannels()
+        assert num_channels == 1
+        sample_width = wf.getsampwidth()
+        assert sample_width == 2
+        sample_rate = wf.getframerate()
+        assert sample_rate in (8000, 16000, 32000, 48000)
+        pcm_data = wf.readframes(wf.getnframes())
+        return pcm_data, sample_rate
+
+
+class Frame(object):
+    """Represents a "frame" of audio data."""
+    def __init__(self, bytes, timestamp, duration):
+        self.bytes = bytes
+        self.timestamp = timestamp
+        self.duration = duration
+
+
+def frame_generator(frame_duration_ms, audio, sample_rate):
+    """Generates audio frames from PCM audio data.
+    Takes the desired frame duration in milliseconds, the PCM data, and
+    the sample rate.
+    Yields Frames of the requested duration.
+    """
+    # 2 bytes per 16-bit sample
+    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
+    offset = 0
+    timestamp = 0.0
+    duration = (float(n) / sample_rate) / 2.0
+    while offset + n < len(audio):
+        yield Frame(audio[offset:offset + n], timestamp, duration)
+        timestamp += duration
+        offset += n
+
+
+def vad_segments(sample_rate, frame_duration_ms,
+                 padding_duration_ms, vad, frames):
+    """Filters out non-voiced audio frames.
+    Given a webrtcvad.Vad and a source of audio frames, keeps only
+    the voiced audio.
+    Uses a padded, sliding window algorithm over the audio frames.
+    When more than 90% of the frames in the window are voiced (as
+    reported by the VAD), the collector triggers and begins collecting
+    audio frames. Then the collector waits until 90% of the frames in
+    the window are unvoiced to detrigger.
+    The window is padded at the front and back to provide a small
+    amount of silence or the beginnings/endings of speech around the
+    voiced frames.
+    Arguments:
+    sample_rate - The audio sample rate, in Hz.
+    frame_duration_ms - The frame duration in milliseconds.
+    padding_duration_ms - The amount to pad the window, in milliseconds.
+    vad - An instance of webrtcvad.Vad.
+    frames - a source of audio frames (sequence or generator).
+    Returns: List of (start_time,end_time) tuples.
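+    Example (a sketch; 30 ms frames and 300 ms padding, the values used below):
+        vad = webrtcvad.Vad(1)
+        frames = list(frame_generator(30, audio, 16000))
+        segments = vad_segments(16000, 30, 300, vad, frames)
+        # -> e.g. [(0.81, 5.43), (6.27, 9.9)]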
+ """ + num_padding_frames = int(padding_duration_ms / frame_duration_ms) + # We use a deque for our sliding window/ring buffer. + ring_buffer = collections.deque(maxlen=num_padding_frames) + # We have two states: TRIGGERED and NOTTRIGGERED. We start in the + # NOTTRIGGERED state. + triggered = False + segments = [] + voiced_frames = [] + for frame in frames: + is_speech = vad.is_speech(frame.bytes, sample_rate) + + if not triggered: + ring_buffer.append((frame, is_speech)) + num_voiced = len([f for f, speech in ring_buffer if speech]) + # If we're NOTTRIGGERED and more than 90% of the frames in + # the ring buffer are voiced frames, then enter the + # TRIGGERED state. + if num_voiced > 0.9 * ring_buffer.maxlen: + triggered = True + for f, s in ring_buffer: + voiced_frames.append(f) + start_time = voiced_frames[0].timestamp + ring_buffer.clear() + else: + # We're in the TRIGGERED state, so collect the audio data + # and add it to the ring buffer. + voiced_frames.append(frame) + ring_buffer.append((frame, is_speech)) + num_unvoiced = len([f for f, speech in ring_buffer if not speech]) + # If more than 90% of the frames in the ring buffer are + # unvoiced, then enter NOTTRIGGERED and yield whatever + # audio we've collected. + if num_unvoiced > 0.9 * ring_buffer.maxlen: + end_time = frame.timestamp + frame.duration + triggered = False + ring_buffer.clear() + voiced_frames = [] + # Write to segments list + segments.append((start_time, end_time)) + # If we have any leftover voiced audio when we run out of input, + # add it to segments list. + if voiced_frames: + end_time = voiced_frames[-1].timestamp + segments.append((start_time, end_time)) + return segments + + +def get_reco2channels(reco2ch_file): + """ + Given a file containing reco id and channel ids for the recording, return + the corresponding dictionary. + """ + reco2channels = {} + with open(reco2ch_file, 'r') as f: + for line in f.readlines(): + reco, channels = line.strip.split(maxsplit=1) + channels = channels.split() + reco2channels[reco] = channels + return reco2channels + +def get_wav_list(data_dir, reco2channels=None): + """ + Return a dictionary of uttid with wav paths. Optionally takes reco2channels and, + if provided, the uttid is actually the recoid. + """ + if reco2channels is not None: + keep_wavs = {reco2channels[reco][0]:reco for reco in reco2channels.keys()} + wav_list = {} + with open(os.path.join(data_dir,'wav.scp'),'r') as f: + for line in f.readlines(): + utt, wav = line.strip().split() + if reco2channels is not None: + if utt in keep_wavs: + wav_list[keep_wavs[utt]] = wav + else: + wav_list[utt] = wav + return wav_list + +def get_speech_segments(uttid, wav, vad): + """ + Compute and print the segments for the given uttid. 
It is in the format:
+    <segment-id> <reco-id> <start-time> <end-time>
+    """
+    audio, sample_rate = read_wave(wav)
+    frames = frame_generator(30, audio, sample_rate)
+    frames = list(frames)
+    segments = vad_segments(sample_rate, 30, 300, vad, frames)
+    for segment in segments:
+        start = float("{:.2f}".format(segment[0]))
+        end = float("{:.2f}".format(segment[1]))
+        segment_id = '{}_{}_{}'.format(uttid, '{:.0f}'.format(100*start).zfill(6), '{:.0f}'.format(100*end).zfill(6))
+        print("{} {} {} {}".format(segment_id, uttid, start, end))
+    return
+
+def main():
+    # First we read and check the arguments
+    args = get_args()
+    check_args(args)
+
+    if args.reco2ch_file is not None:
+        reco2channels = get_reco2channels(args.reco2ch_file)
+        wav_list = get_wav_list(args.data_dir, reco2channels)
+    else:
+        wav_list = get_wav_list(args.data_dir)
+
+    vad = webrtcvad.Vad(args.mode)
+    for utt in wav_list.keys():
+        get_speech_segments(utt, wav_list[utt], vad)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/egs/libri_css/s5_mono/local/segmentation/detect_speech_activity.sh b/egs/libri_css/s5_mono/local/segmentation/detect_speech_activity.sh
new file mode 100755
index 00000000000..c9719d472f3
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/segmentation/detect_speech_activity.sh
@@ -0,0 +1,217 @@
+#!/usr/bin/env bash
+
+# Copyright 2016-17  Vimal Manohar
+#              2017  Nagendra Kumar Goel
+# Apache 2.0.
+
+# This script does nnet3-based speech activity detection given an input
+# kaldi data directory and outputs a segmented kaldi data directory.
+# This script can also do music detection and other similar segmentation
+# using appropriate options such as --output-name output-music.
+
+set -e
+set -o pipefail
+set -u
+
+if [ -f ./path.sh ]; then . ./path.sh; fi
+
+affix=   # Affix for the segmentation
+nj=32
+cmd=queue.pl
+stage=-1
+
+# Feature options (Must match training)
+mfcc_config=conf/mfcc_hires.conf
+feat_affix=   # Affix for the type of feature used
+
+output_name=output   # The output node in the network
+sad_name=sad   # Base name for the directory storing the computed loglikes
+               # Can be music for music detection
+segmentation_name=segmentation   # Base name for the directory doing segmentation
+                                 # Can be segmentation_music for music detection
+
+# SAD network config
+iter=final   # Model iteration to use
+
+# Contexts must ideally match training for LSTM models, but
+# may not necessarily for stats components
+extra_left_context=0   # Set to some large value, typically 40 for LSTM (must match training)
+extra_right_context=0
+extra_left_context_initial=-1
+extra_right_context_final=-1
+frames_per_chunk=150
+
+# Decoding options
+graph_opts="--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0"
+acwt=0.3
+
+# These <class1>_in_<class2>_weight represent the fraction of probability
+# to transfer to class <class2>.
+# e.g. --speech-in-sil-weight=0.0 --garbage-in-sil-weight=0.0 --sil-in-speech-weight=0.0 --garbage-in-speech-weight=0.3
+transform_probs_opts=""
+
+# Postprocessing options
+segment_padding=0.2   # Duration (in seconds) of padding added to segments
+min_segment_dur=0   # Minimum duration (in seconds) required for a segment to be included
+                    # This is before any padding. Segments shorter than this duration will be removed.
+                    # This is an alternative to --min-speech-duration above.
+merge_consecutive_max_dur=0   # Merge consecutive segments as long as the merged segment is no longer than this many
+                              # seconds. The segments are only merged if their boundaries are touching.
+                              # This is after padding by --segment-padding seconds.
+                              # 0 means do not merge. Use 'inf' to not limit the duration.
+
+echo $*
+
+. utils/parse_options.sh
+
+if [ $# -ne 5 ]; then
+  echo "This script does nnet3-based speech activity detection given an input kaldi "
+  echo "data directory and outputs an output kaldi data directory."
+  echo "See script for details of the options to be supplied."
+  echo "Usage: $0 <src-data-dir> <sad-nnet-dir> <mfcc-dir> <work-dir> <out-data-dir>"
+  echo " e.g.: $0 ~/workspace/egs/ami/s5b/data/sdm1/dev exp/nnet3_sad_snr/nnet_tdnn_j_n4 \\"
+  echo "    mfcc_hires exp/segmentation_sad_snr/nnet_tdnn_j_n4 data/ami_sdm1_dev"
+  echo ""
+  echo "Options: "
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>)  # how to run jobs."
+  echo "  --nj <num-jobs>                                   # number of parallel jobs to run."
+  echo "  --stage <stage>                                   # stage to do partial re-run from."
+  echo "  --convert-data-dir-to-whole <true|false>          # If true, the input data directory is "
+  echo "                                                    # first converted to whole data directory (i.e. whole recordings) "
+  echo "                                                    # and segmentation is done on that."
+  echo "                                                    # If false, then the original segments are "
+  echo "                                                    # retained and they are split into sub-segments."
+  echo "  --output-name <name>                              # The output node in the network"
+  echo "  --extra-left-context <int>                        # Set to some large value, typically 40 for LSTM (must match training)"
+  echo "  --extra-right-context <int>                       # For BLSTM or statistics pooling"
+  exit 1
+fi
+
+src_data_dir=$1   # The input data directory that needs to be segmented.
+                  # If convert_data_dir_to_whole is true, any segments in that will be ignored.
+sad_nnet_dir=$2   # The SAD neural network
+mfcc_dir=$3       # The directory to store the features
+dir=$4            # Work directory
+data_dir=$5       # The output data directory will be ${data_dir}_seg
+
+affix=${affix:+_$affix}
+feat_affix=${feat_affix:+_$feat_affix}
+
+data_id=`basename $data_dir`
+sad_dir=${dir}/${sad_name}${affix}_${data_id}${feat_affix}
+seg_dir=${dir}/${segmentation_name}${affix}_${data_id}${feat_affix}
+test_data_dir=data/${data_id}${feat_affix}
+
+###############################################################################
+## Forward pass through the network and dump the log-likelihoods.
+###############################################################################
+
+frame_subsampling_factor=1
+if [ -f $sad_nnet_dir/frame_subsampling_factor ]; then
+  frame_subsampling_factor=$(cat $sad_nnet_dir/frame_subsampling_factor)
+fi
+
+mkdir -p $dir
+if [ $stage -le 1 ]; then
+  if [ "$(readlink -f $sad_nnet_dir)" != "$(readlink -f $dir)" ]; then
+    cp $sad_nnet_dir/cmvn_opts $dir || exit 1
+  fi
+
+  ########################################################################
+  ## Initialize neural network for decoding using the output $output_name
+  ########################################################################
+
+  if [ ! -z "$output_name" ] && [ "$output_name" != output ]; then
+    $cmd $dir/log/get_nnet_${output_name}.log \
+      nnet3-copy --edits="rename-node old-name=$output_name new-name=output" \
+      $sad_nnet_dir/$iter.raw $dir/${iter}_${output_name}.raw || exit 1
+    iter=${iter}_${output_name}
+  else
+    if ! diff $sad_nnet_dir/$iter.raw $dir/$iter.raw; then
+      cp $sad_nnet_dir/$iter.raw $dir/
+    fi
+  fi
+
+  steps/nnet3/compute_output.sh --nj $nj --cmd "$cmd" \
+    --iter ${iter} \
+    --extra-left-context $extra_left_context \
+    --extra-right-context $extra_right_context \
+    --extra-left-context-initial $extra_left_context_initial \
+    --extra-right-context-final $extra_right_context_final \
+    --frames-per-chunk $frames_per_chunk --apply-exp true \
+    --frame-subsampling-factor $frame_subsampling_factor \
+    ${test_data_dir} $dir $sad_dir || exit 1
+fi
+
+###############################################################################
+## Prepare FST we search to make speech/silence decisions.
+###############################################################################
+
+utils/data/get_utt2dur.sh --nj $nj --cmd "$cmd" $test_data_dir || exit 1
+frame_shift=$(utils/data/get_frame_shift.sh $test_data_dir) || exit 1
+
+graph_dir=${dir}/graph_${output_name}
+if [ $stage -le 2 ]; then
+  mkdir -p $graph_dir
+
+  # 1 for silence and 2 for speech
+  cat <<EOF > $graph_dir/words.txt
+<eps> 0
+silence 1
+speech 2
+EOF
+
+  $cmd $graph_dir/log/make_graph.log \
+    steps/segmentation/internal/prepare_sad_graph.py $graph_opts \
+    --frame-shift=$(perl -e "print $frame_shift * $frame_subsampling_factor") - \| \
+    fstcompile --isymbols=$graph_dir/words.txt --osymbols=$graph_dir/words.txt '>' \
+    $graph_dir/HCLG.fst
+fi
+
+###############################################################################
+## Do Viterbi decoding to create per-frame alignments.
+###############################################################################
+
+post_vec=$sad_nnet_dir/post_${output_name}.vec
+if [ ! -f $sad_nnet_dir/post_${output_name}.vec ]; then
+  if [ ! -f $sad_nnet_dir/post_${output_name}.txt ]; then
+    echo "$0: Could not find $sad_nnet_dir/post_${output_name}.vec. "
+    echo "Re-run the corresponding stage in the training script possibly "
+    echo "with --compute-average-posteriors=true or compute the priors "
+    echo "from the training labels"
+    exit 1
+  else
+    post_vec=$sad_nnet_dir/post_${output_name}.txt
+  fi
+fi
+
+mkdir -p $seg_dir
+if [ $stage -le 3 ]; then
+  steps/segmentation/internal/get_transform_probs_mat.py \
+    --priors="$post_vec" $transform_probs_opts > $seg_dir/transform_probs.mat
+
+  steps/segmentation/decode_sad.sh --acwt $acwt --cmd "$cmd" \
+    --nj $nj \
+    --transform "$seg_dir/transform_probs.mat" \
+    $graph_dir $sad_dir $seg_dir
+fi
+
+###############################################################################
+## Post-process segmentation to create kaldi data directory.
+###############################################################################
+
+if [ $stage -le 4 ]; then
+  steps/segmentation/post_process_sad_to_segments.sh \
+    --segment-padding $segment_padding --min-segment-dur $min_segment_dur \
+    --merge-consecutive-max-dur $merge_consecutive_max_dur \
+    --cmd "$cmd" --frame-shift $(perl -e "print $frame_subsampling_factor * $frame_shift") \
+    ${test_data_dir} ${seg_dir} ${seg_dir}
+fi
+
+if [ $stage -le 5 ]; then
+  utils/data/subsegment_data_dir.sh ${test_data_dir} ${seg_dir}/segments \
+    ${data_dir}_seg
+fi
+
+echo "$0: Created output segmented kaldi data directory in ${data_dir}_seg"
+exit 0
diff --git a/egs/libri_css/s5_mono/local/train_asr.sh b/egs/libri_css/s5_mono/local/train_asr.sh
new file mode 100755
index 00000000000..99043607f2a
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/train_asr.sh
@@ -0,0 +1,205 @@
+# This script is called from run.sh. It downloads the Librispeech
+# data and trains an ASR model on it.
+
+nj=50
+stage=0
+
+. ./utils/parse_options.sh
+
+echo >&2 "$0" "$@"
+if [ $# -ne 1 ] ; then
+  echo >&2 "$0" "$@"
+  echo >&2 "$0: Error: wrong number of arguments"
+  echo -e >&2 "Usage:\n  $0 [opts] <corpus-dir>"
+  echo -e >&2 "eg:\n  $0 /export/corpora/Librispeech"
+  exit 1
+fi
+data=$1
+
+train_sets="train_clean_100 train_clean_360 train_other_500"
+
+. ./cmd.sh
+. ./path.sh
+
+set -e # exit on error
+
+# base url for downloads.
+data_url=www.openslr.org/resources/12
+lm_url=www.openslr.org/resources/11
+mfccdir=mfcc
+
+
+if [ $stage -le 1 ]; then
+  # prepare the data (uncomment download_and_untar below if you still need to download it).
+  for part in train-clean-100 train-clean-360 train-other-500; do
+    # local/download_and_untar.sh $data $data_url $part
+    local/data_prep_librispeech.sh $data/$part \
+      data/$(echo $part | sed s/-/_/g)
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  # spread the mfccs over various machines, as this data-set is quite large.
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
+    mfcc=$(basename $mfccdir) # in case it was an absolute pathname (unlikely), get the basename.
+    utils/create_split_dir.pl /export/b{02,11,12,13}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \
+      $mfccdir/storage
+  fi
+fi
+
+
+if [ $stage -le 3 ]; then
+  for part in $train_sets; do
+    steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj data/$part exp/make_mfcc/$part $mfccdir
+    steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  # Make some small data subsets for early system-build stages. Note, there are 29k
+  # utterances in the train_clean_100 directory which has 100 hours of data.
+  # For the monophone stages we select the shortest utterances, which should make it
+  # easier to align the data from a flat start.
+
+  utils/subset_data_dir.sh --shortest data/train_clean_100 2000 data/train_2kshort
+  utils/subset_data_dir.sh data/train_clean_100 5000 data/train_5k
+  utils/subset_data_dir.sh data/train_clean_100 10000 data/train_10k
+
+  # We also combine the clean data which will be used to train the larger SAT model
+  utils/combine_data.sh \
+    data/train_clean_460 data/train_clean_100 data/train_clean_360
+
+  # And combine all 960h data, which will be used to train the nnet
+  utils/combine_data.sh \
+    data/train_960 data/train_clean_460 data/train_other_500
+fi
+
+if [ $stage -le 5 ]; then
+  # download the LM resources
+  local/download_lm.sh $lm_url data/local/lm
+fi
+
+if [ $stage -le 6 ]; then
+  # when the "--stage 3" option is used below we skip the G2P steps, and use the
+  # lexicon we have already downloaded from openslr.org/11/
+  local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \
+    data/local/lm data/local/lm data/local/dict_nosp
+
+  utils/prepare_lang.sh data/local/dict_nosp \
+    "<UNK>" data/local/lang_tmp_nosp data/lang_nosp
+
+  local/format_lms.sh --src-dir data/lang_nosp data/local/lm
+fi
+
+if [ $stage -le 7 ]; then
+  # Create ConstArpaLm format language model for full 3-gram and 4-gram LMs
+  utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz \
+    data/lang_nosp data/lang_nosp_test_tglarge
+  utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz \
+    data/lang_nosp data/lang_nosp_test_fglarge
+fi
+
+if [ $stage -le 8 ]; then
+  # train a monophone system
+  steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \
+    data/train_2kshort data/lang_nosp exp/mono
+fi
+
+if [ $stage -le 9 ]; then
+  steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
+    data/train_5k data/lang_nosp exp/mono exp/mono_ali_5k
+
+  # train a first delta + delta-delta triphone system on a subset of 5000 utterances
+  steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
+    2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1
+fi
+
+if [ $stage -le 10 ]; then
+  steps/align_si.sh --nj 10 --cmd "$train_cmd" \
+    data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k
+
+
+  # train an LDA+MLLT system.
+  steps/train_lda_mllt.sh --cmd "$train_cmd" \
+    --splice-opts "--left-context=3 --right-context=3" 2500 15000 \
+    data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b
+fi
+
+if [ $stage -le 11 ]; then
+  # Align a 10k utts subset using the tri2b model
+  steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \
+    data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k
+
+  # Train tri3b, which is LDA+MLLT+SAT on 10k utts
+  steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \
+    data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b
+fi
+
+if [ $stage -le 12 ]; then
+  # align the entire train_clean_100 subset using the tri3b model
+  steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
+    data/train_clean_100 data/lang_nosp \
+    exp/tri3b exp/tri3b_ali_clean_100
+
+  # train another LDA+MLLT+SAT system on the entire 100 hour subset
+  steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
+    data/train_clean_100 data/lang_nosp \
+    exp/tri3b_ali_clean_100 exp/tri4b
+fi
+
+if [ $stage -le 13 ]; then
+  # Now we compute the pronunciation and silence probabilities from training data,
+  # and re-create the lang directory.
+  steps/get_prons.sh --cmd "$train_cmd" \
+    data/train_clean_100 data/lang_nosp exp/tri4b
+  utils/dict_dir_add_pronprobs.sh --max-normalize true \
+    data/local/dict_nosp \
+    exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \
+    exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict
+
+  utils/prepare_lang.sh data/local/dict \
+    "<UNK>" data/local/lang_tmp data/lang
+  local/format_lms.sh --src-dir data/lang data/local/lm
+
+  utils/build_const_arpa_lm.sh \
+    data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge
+  utils/build_const_arpa_lm.sh \
+    data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge
+fi
+
+if [ $stage -le 14 ]; then
+  # align the 460h clean set using the tri4b model
+  steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
+    data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460
+
+  # create a larger SAT model, trained on the 460 hours of data.
+  steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \
+    data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b
+fi
+
+if [ $stage -le 15 ]; then
+  steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
+    data/train_960 data/lang exp/tri5b exp/tri5b_ali_960
+
+  # train a SAT model on the 960 hour mixed data. Use the train_quick.sh script
+  # as it is faster.
+  steps/train_quick.sh --cmd "$train_cmd" \
+    7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b
+fi
+
+if [ $stage -le 16 ]; then
+  # this does some data-cleaning. The cleaned data should be useful when we add
+  # the neural net and chain systems. (although actually it was pretty clean already.)
+  local/run_cleanup_segmentation.sh
+fi
+
+if [ $stage -le 17 ]; then
+  # train and test nnet3 tdnn models on the entire data with data-cleaning.
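+  # (If you copied the pretrained chain model into exp/ instead, as described
+  # in the comments in run.sh, this stage can be skipped.)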
+  local/chain/run_tdnn.sh
+
+  # Fine tune with reverberated Librispeech data
+  local/chain/tuning/run_tdnn_1d_ft.sh
+fi
+
+
+exit 0
\ No newline at end of file
diff --git a/egs/libri_css/s5_mono/local/train_diarizer.sh b/egs/libri_css/s5_mono/local/train_diarizer.sh
new file mode 100755
index 00000000000..6fc7156cf8b
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/train_diarizer.sh
@@ -0,0 +1,186 @@
+#!/usr/bin/env bash
+# Copyright
+#      2019  David Snyder
+# Apache 2.0.
+#
+# This script is based on the run.sh script in the Voxceleb v2 recipe.
+# It trains an x-vector DNN for diarization.
+
+mfccdir=`pwd`/mfcc
+vaddir=`pwd`/mfcc
+
+voxceleb1_root=/export/corpora/VoxCeleb1
+voxceleb2_root=/export/corpora/VoxCeleb2
+data_dir=train_other_500
+model_dir=exp/xvector_nnet_1a
+
+stage=0
+train_stage=-1
+
+. ./cmd.sh
+
+if [ -f ./path.sh ]; then . ./path.sh; fi
+set -e -u -o pipefail
+. utils/parse_options.sh
+
+if [ $# -ne 0 ]; then
+  exit 1
+fi
+
+if [ $stage -le 0 ]; then
+  echo "$0: preparing voxceleb 2 data"
+  local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
+  local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
+
+  echo "$0: preparing voxceleb 1 data (see comments if this step fails)"
+  # The format of the voxceleb 1 corpus has changed several times since it was
+  # released. Therefore, our dataprep scripts may or may not fail depending
+  # on the version of the corpus you obtained.
+  # If you downloaded the corpus soon after it was first released, this
+  # version of the dataprep script might work:
+  local/make_voxceleb1.pl $voxceleb1_root data/voxceleb1
+  # However, if you've downloaded the corpus recently, you may need to use
+  # the following scripts instead:
+  #local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
+  #local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test
+
+  # We should now have about 7,351 speakers and 1,277,503 utterances.
+  utils/combine_data.sh data/voxceleb data/voxceleb2_train data/voxceleb2_test
+fi
+
+if [ $stage -le 1 ]; then
+  echo "$0: preparing features for training data (voxceleb 1 + 2)"
+  steps/make_mfcc.sh --write-utt2num-frames true \
+    --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \
+    data/voxceleb exp/make_mfcc $mfccdir
+  utils/fix_data_dir.sh data/voxceleb
+  # Note that we apply CMN to the MFCCs and write these to the disk. These
+  # features will later be used to train the x-vector DNN.
+fi
+
+# In this section, we augment the voxceleb data with reverberation.
+# Note that we can probably improve the x-vector DNN if we include
+# augmentations from the nonspeech regions of the Chime 6 training
+# dataset.
+if [ $stage -le 2 ]; then
+  echo "$0: applying augmentation to x-vector training data (just reverb for now)"
+  frame_shift=0.01
+  awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/voxceleb/utt2num_frames > data/voxceleb/reco2dur
+
+  if [ ! -d "RIRS_NOISES" ]; then
+    echo "$0: downloading simulated room impulse response dataset"
+    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
+    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
+    unzip rirs_noises.zip
+  fi
+
+  # Make a version with reverberated speech
+  rvb_opts=()
+  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
+  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
+
+  # Make a reverberated version of the training data. Note that we don't add any
+  # additive noise here.
+  steps/data/reverberate_data_dir.py \
+    "${rvb_opts[@]}" \
+    --speech-rvb-probability 1 \
+    --pointsource-noise-addition-probability 0 \
+    --isotropic-noise-addition-probability 0 \
+    --num-replications 1 \
+    --source-sampling-rate 16000 \
+    data/voxceleb data/voxceleb_reverb
+  utils/copy_data_dir.sh --utt-suffix "-reverb" data/voxceleb_reverb data/voxceleb_reverb.new
+  rm -rf data/voxceleb_reverb
+  mv data/voxceleb_reverb.new data/voxceleb_reverb
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: making MFCCs for augmented training data"
+  # Make MFCCs for the augmented data. Note that we do not compute a new
+  # vad.scp file here. Instead, we use the vad.scp from the clean version of
+  # the list.
+  steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \
+    data/voxceleb_reverb exp/make_mfcc $mfccdir
+  # Combine the clean and augmented training data. This is now roughly
+  # double the size of the original clean list.
+  utils/combine_data.sh data/voxceleb_combined data/voxceleb_reverb data/voxceleb
+fi
+
+# Now we prepare the features to generate examples for xvector training.
+if [ $stage -le 4 ]; then
+  # This script applies CMVN and removes nonspeech frames. Note that this is somewhat
+  # wasteful, as it roughly doubles the amount of training data on disk. After
+  # creating voxceleb examples, this can be removed.
+  echo "$0: preparing features to train x-vector DNN"
+  local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \
+    data/voxceleb_combined data/voxceleb_combined_cmn exp/voxceleb_combined_cmn
+  utils/fix_data_dir.sh data/voxceleb_combined_cmn
+fi
+
+if [ $stage -le 5 ]; then
+  # Now, we need to remove features that are too short after removing silence
+  # frames. We want at least 4s (400 frames) per utterance.
+  min_len=400
+  mv data/voxceleb_combined_cmn/utt2num_frames data/voxceleb_combined_cmn/utt2num_frames.bak
+  awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/voxceleb_combined_cmn/utt2num_frames.bak > data/voxceleb_combined_cmn/utt2num_frames
+  utils/filter_scp.pl data/voxceleb_combined_cmn/utt2num_frames data/voxceleb_combined_cmn/utt2spk > data/voxceleb_combined_cmn/utt2spk.new
+  mv data/voxceleb_combined_cmn/utt2spk.new data/voxceleb_combined_cmn/utt2spk
+  utils/fix_data_dir.sh data/voxceleb_combined_cmn
+
+  # We also want several utterances per speaker. Now we'll throw out speakers
+  # with fewer than 8 utterances.
+  min_num_utts=8
+  awk '{print $1, NF-1}' data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/spk2num
+  awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' data/voxceleb_combined_cmn/spk2num | utils/filter_scp.pl - data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/spk2utt.new
+  mv data/voxceleb_combined_cmn/spk2utt.new data/voxceleb_combined_cmn/spk2utt
+  utils/spk2utt_to_utt2spk.pl data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/utt2spk
+
+  utils/filter_scp.pl data/voxceleb_combined_cmn/utt2spk data/voxceleb_combined_cmn/utt2num_frames > data/voxceleb_combined_cmn/utt2num_frames.new
+  mv data/voxceleb_combined_cmn/utt2num_frames.new data/voxceleb_combined_cmn/utt2num_frames
+
+  utils/fix_data_dir.sh data/voxceleb_combined_cmn
+fi
+
+# Stages 6 through 8 are handled in run_xvector.sh.
+# This script trains the x-vector DNN on the augmented voxceleb data.
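+# (It is invoked unconditionally: since it receives --stage $stage, its
+# internal stage checks should make it a no-op once $stage is past 8.)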
+local/nnet3/xvector/run_xvector.sh --stage $stage --train-stage $train_stage \
+  --data data/voxceleb_combined_cmn --nnet-dir $model_dir \
+  --egs-dir $model_dir/egs
+
+if [ $stage -le 9 ]; then
+  echo "$0: preparing a subset of Librispeech data to train PLDA model"
+  utils/subset_data_dir.sh ${data_dir} 100000 data/plda_train
+  steps/make_mfcc.sh --write-utt2num-frames true \
+    --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \
+    data/plda_train exp/make_mfcc $mfccdir
+  utils/fix_data_dir.sh data/plda_train
+  local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \
+    data/plda_train data/plda_train_cmn exp/plda_train_cmn
+  if [ -f data/plda_train/segments ]; then
+    cp data/plda_train/segments data/plda_train_cmn/
+  fi
+fi
+
+if [ $stage -le 10 ]; then
+  echo "$0: extracting x-vector for PLDA training data"
+  utils/fix_data_dir.sh data/plda_train_cmn
+  diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 10G" \
+    --nj 40 --window 3.0 --period 10.0 --min-segment 1.5 --apply-cmn false \
+    --hard-min true $model_dir \
+    data/plda_train_cmn $model_dir/xvectors_plda_train
+fi
+
+# Train PLDA models
+if [ $stage -le 11 ]; then
+  echo "$0: training PLDA model"
+  $train_cmd $model_dir/xvectors_plda_train/log/plda.log \
+    ivector-compute-plda ark:$model_dir/xvectors_plda_train/spk2utt \
+    "ark:ivector-subtract-global-mean \
+    scp:$model_dir/xvectors_plda_train/xvector.scp ark:- \
+    | transform-vec $model_dir/xvectors_plda_train/transform.mat ark:- ark:- \
+    | ivector-normalize-length ark:- ark:- |" \
+    $model_dir/xvectors_plda_train/plda || exit 1;
+  cp $model_dir/xvectors_plda_train/plda $model_dir/
+  cp $model_dir/xvectors_plda_train/transform.mat $model_dir/
+  cp $model_dir/xvectors_plda_train/mean.vec $model_dir/
+fi
diff --git a/egs/libri_css/s5_mono/local/wer_output_filter b/egs/libri_css/s5_mono/local/wer_output_filter
new file mode 100755
index 00000000000..6f4b6400716
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/wer_output_filter
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal <jtrmal@gmail.com>)
+# Apache 2.0
+
+
+## Filter for scoring of the STT results. Convert everything to lowercase
+## and add some ad-hoc fixes for the hesitations
+
+perl -e '
+  while(<STDIN>) {
+    @A = split(" ", $_);
+    $id = shift @A; print "$id ";
+    foreach $a (@A) {
+      print lc($a) . " " unless $a =~ /\[.*\]/;
+    }
+    print "\n";
+  }' | \
+sed -e '
+  s/\<mm\>/hmm/g;
+  s/\<mhm\>/hmm/g;
+  s/\<mmhm\>/hmm/g;
+'
+
+#| uconv -f utf-8 -t utf-8 -x Latin-ASCII
+
diff --git a/egs/libri_css/s5_mono/path.sh b/egs/libri_css/s5_mono/path.sh
new file mode 100644
index 00000000000..ab1a81a86ef
--- /dev/null
+++ b/egs/libri_css/s5_mono/path.sh
@@ -0,0 +1,10 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
+export PATH=$PWD/dscore:$PATH
+export PYTHONPATH="${PYTHONPATH}:$PWD/dscore"
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
+export BASH_ENV="~/.aliases"
+
diff --git a/egs/libri_css/s5_mono/rnnlm b/egs/libri_css/s5_mono/rnnlm
new file mode 120000
index 00000000000..e136939ba72
--- /dev/null
+++ b/egs/libri_css/s5_mono/rnnlm
@@ -0,0 +1 @@
+../../../scripts/rnnlm/
\ No newline at end of file
diff --git a/egs/libri_css/s5_mono/run.sh b/egs/libri_css/s5_mono/run.sh
new file mode 100755
index 00000000000..7b5f4b8e350
--- /dev/null
+++ b/egs/libri_css/s5_mono/run.sh
@@ -0,0 +1,99 @@
+#!/usr/bin/env bash
+#
+# LibriCSS monaural baseline recipe.
+#
+# Copyright 2020 Johns Hopkins University (Author: Desh Raj)
+# Apache 2.0
+
+# Begin configuration section.
+nj=50
+decode_nj=20
+stage=0
+
+# Different stages
+asr_stage=1
+diarizer_stage=0
+rnnlm_stage=0
+decode_stage=0
+rnnlm_rescore=true
+
+data_affix=  # This can be used to distinguish between different data sources
+
+use_oracle_segments=false
+wpe=false
+
+# End configuration section
+. ./utils/parse_options.sh
+
+. ./cmd.sh
+. ./path.sh
+
+test_sets="dev${data_affix} eval${data_affix}"
+
+set -e # exit on error
+
+# please change the paths below according to your setup
+libricss_corpus=/export/fs01/LibriCSS
+librispeech_corpus=/export/corpora5/LibriSpeech/
+
+##########################################################################
+# We first prepare the LibriCSS data (monaural) in the Kaldi data
+# format. We use session 0 for dev and the others for eval.
+##########################################################################
+if [ $stage -le 0 ]; then
+  local/data_prep_mono.sh --data-affix "$data_affix" $libricss_corpus $librispeech_corpus
+fi
+
+#########################################################################
+# ASR MODEL TRAINING
+# In this stage, we prepare the Librispeech data and train our ASR model.
+# This part is taken from the librispeech recipe, with parts related to
+# decoding removed. We use the 100h clean subset to train most of the
+# GMM models, except the SAT model, which is trained on the 460h clean
+# subset. The nnet is trained on the full 960h (clean + other).
+# To avoid training the whole ASR from scratch, you can download the
+# chain model using:
+# wget http://kaldi-asr.org/models/13/0013_librispeech_s5.tar.gz
+# Once it is downloaded, extract using: tar -xvzf 0013_librispeech_s5.tar.gz
+# and copy the contents of the exp/ directory to your exp/.
+#########################################################################
+if [ $stage -le 1 ]; then
+  local/train_asr.sh --stage $asr_stage --nj $nj $librispeech_corpus
+fi
+
+##########################################################################
+# DIARIZATION MODEL TRAINING
+# You can also download a pretrained diarization model using:
+# wget http://kaldi-asr.org/models/12/0012_diarization_v1.tar.gz
+# Once it is downloaded, extract using: tar -xvzf 0012_diarization_v1.tar.gz
+# and copy the contents of the exp/ directory to your exp/
+##########################################################################
+if [ $stage -le 2 ]; then
+  local/train_diarizer.sh --stage $diarizer_stage \
+    --data-dir data/train_other_500 \
+    --model-dir exp/xvector_nnet_1a
+fi
+
+##########################################################################
+# RNNLM TRAINING
+# We train a TDNN-LSTM based LM that will be used for rescoring the
+# decoded lattices.
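+# (See local/rnnlm/tuning/run_tdnn_lstm_1a.sh for the network configuration,
+# the training data (librispeech-lm-norm), and the rescoring options.)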
+########################################################################## +if [ $stage -le 3 ]; then + local/rnnlm/train.sh --stage $rnnlm_stage +fi + +########################################################################## +# DECODING: We assume that we are just given the raw recordings (approx 10 +# mins each), without segments or speaker information, so we have to decode +# the whole pipeline, i.e., SAD -> Diarization -> ASR. This is done in the +# local/decode.sh script. +########################################################################## +if [ $stage -le 4 ]; then + local/decode.sh --stage $decode_stage \ + --test-sets "$test_sets" \ + --use-oracle-segments $use_oracle_segments \ + --rnnlm-rescore $rnnlm_rescore +fi + +exit 0; + diff --git a/egs/libri_css/s5_mono/sid b/egs/libri_css/s5_mono/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/libri_css/s5_mono/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/libri_css/s5_mono/steps b/egs/libri_css/s5_mono/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/libri_css/s5_mono/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/libri_css/s5_mono/utils b/egs/libri_css/s5_mono/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/libri_css/s5_mono/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file