Merge pull request kaldi-asr#3 from chimechallenge/sad
Track 2 pipeline with SAD and Diarization
Showing 35 changed files with 2,817 additions and 0 deletions.
@@ -77,6 +77,8 @@ GSYMS
/egs/*/*/plp
/egs/*/*/exp
/egs/*/*/data
/egs/*/*/wav
/egs/*/*/enhan

# /tools/
/tools/pocolm/
@@ -0,0 +1,6 @@
This is a Kaldi recipe for the 6th CHiME Speech Separation and Recognition Challenge (CHiME-6).

See http://spandh.dcs.shef.ac.uk/chime_challenge/ for more detailed information.

s5_track1 : Track 1 of the challenge (oracle segments and speaker labels are provided)
s5_track2 : Track 2 of the challenge (only raw audio is provided)
@@ -0,0 +1,15 @@
# You can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
# with slurm.  Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration.  Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="retry.pl queue.pl --mem 2G"
export decode_cmd="queue.pl --mem 4G"
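As the comments above suggest, a machine without a queueing system can simply use run.pl instead. A minimal local-machine variant of cmd.sh (a sketch, not part of this commit) would be:

# hypothetical local-only cmd.sh; run.pl ignores resource options such as --mem
export train_cmd="run.pl"
export decode_cmd="run.pl"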
@@ -0,0 +1,50 @@
# BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/)

# scrolling size to compute the delays
scroll_size = 250

# cross-correlation computation window size
window_size = 500

# maximum number of cross-correlation points taken into account
nbest_amount = 4

# flag whether to apply an automatic noise thresholding
do_noise_threshold = 1

# percentage of frames with lower xcorr taken as noisy
noise_percent = 10

######## acoustic modelling parameters

# transition probabilities weight for multichannel decoding
trans_weight_multi = 25
trans_weight_nbest = 25

###

# flag whether to print the features after setting them, or not
print_features = 1

# flag whether to use the bad frames in the sum process
do_avoid_bad_frames = 1

# flag to use the best channel (by SNR) as a reference
# defined from the command line
do_compute_reference = 1

# flag whether to use a UEM file or not (otherwise process the whole file)
do_use_uem_file = 0

# flag whether to use an adaptive weights scheme or fixed weights
do_adapt_weights = 1

# flag whether to output the sph files or just run the system to create the auxiliary files
do_write_sph_files = 1

#### directories where to store/retrieve info ####
# channels_file = ./cfg-files/channels

# show needs to be passed as an argument normally; here a default one is given just in case
# show_id = Ttmp
@@ -0,0 +1,2 @@
--use-energy=false
--sample-frequency=16000
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated), which is why
# we prefer this method.
--use-energy=false   # use average of log energy, not energy.
--sample-frequency=16000
--num-mel-bins=40
--num-ceps=40
--low-freq=40
--high-freq=-400
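As a usage illustration (not part of this commit), this config is consumed by steps/make_mfcc.sh via --mfcc-config in the track 2 decode script below; an equivalent direct call to Kaldi's compute-mfcc-feats, with hypothetical input and output paths, would be:

# sketch only; data/dev/wav.scp and the mfcc/ output paths are placeholders
compute-mfcc-feats --config=conf/mfcc_hires.conf \
  scp:data/dev/wav.scp ark,scp:mfcc/raw_mfcc_dev.ark,mfcc/raw_mfcc_dev.scp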
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
@@ -0,0 +1,2 @@
affix=_1a
nnet_type=stats
@@ -0,0 +1 @@
../../callhome_diarization/v1/diarization
@@ -0,0 +1,71 @@
#!/bin/bash -u

# Copyright 2015 (c) Johns Hopkins University (Jan Trmal <jtrmal@gmail.com>)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

[ -f ./path.sh ] && . ./path.sh

command -v uconv &>/dev/null \
  || { echo >&2 "uconv not found on PATH. You will have to install ICU4C"; exit 1; }

command -v ngram &>/dev/null \
  || { echo >&2 "srilm not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_srilm.sh to install it"; exit 1; }

if [ -z "${LIBLBFGS:-}" ]; then
  echo >&2 "SRILM is not compiled with support for MaxEnt models."
  echo >&2 "You should use the script \$KALDI_ROOT/tools/install_srilm.sh,"
  echo >&2 "which will take care of compiling SRILM with MaxEnt support."
  exit 1;
fi

sox=`command -v sox 2>/dev/null` \
  || { echo >&2 "sox not found on PATH. Please install it manually (you will need version 14.4.0 or higher)."; exit 1; }

# If sox is found on the PATH, check that the version is correct
if [ ! -z "$sox" ]; then
  sox_version=`$sox --version 2>&1 | head -1 | sed -e 's?.*: ??' -e 's?.* ??'`
  if [[ ! $sox_version =~ v14.4.* ]]; then
    echo "Unsupported sox version $sox_version found on path. You will need version 14.4.0 or higher."
    exit 1
  fi
fi

command -v phonetisaurus-align &>/dev/null \
  || { echo >&2 "Phonetisaurus not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_phonetisaurus.sh to install it"; exit 1; }

command -v BeamformIt &>/dev/null \
  || { echo >&2 "BeamformIt not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_beamformit.sh to install it"; exit 1; }

miniconda_dir=$HOME/miniconda3/
if [ ! -d $miniconda_dir ]; then
  echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh'"
fi

# check if WPE is installed
result=`$miniconda_dir/bin/python -c "\
try:
    import nara_wpe
    print('1')
except ImportError:
    print('0')"`

if [ "$result" != "1" ]; then
  echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh"
  exit 1
fi

exit 0
@@ -0,0 +1,126 @@
#!/bin/bash
#
# This script decodes raw utterances through the entire pipeline:
# Feature extraction -> SAD -> Diarization -> ASR
#
# Copyright  2017  Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
#            2019  Desh Raj, David Snyder, Ashish Arora
# Apache 2.0

# Begin configuration section.
nj=50
decode_nj=20
stage=0
sad_stage=0
diarizer_stage=0
enhancement=
test_sets=
skip_scoring=false
# End configuration section
. ./utils/parse_options.sh

. ./cmd.sh
. ./path.sh
. ./conf/sad.conf

#######################################################################
# Prepare the dev and eval data with dereverberation (WPE) and
# beamforming.
#######################################################################
if [ $stage -le 0 ]; then
  # Beamforming using reference arrays
  # enhanced WAV directory
  enhandir=enhan
  dereverb_dir=${PWD}/wav/wpe/

  for dset in dev eval; do
    for mictype in u01 u02 u03 u04 u06; do
      local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 120G" \
        ${audio_dir}/${dset} \
        ${dereverb_dir}/${dset} \
        ${mictype}
    done
  done

  for dset in dev eval; do
    for mictype in u01 u02 u03 u04 u06; do
      local/run_beamformit.sh --cmd "$train_cmd" \
        ${dereverb_dir}/${dset} \
        ${enhandir}/${dset}_${enhancement}_${mictype} \
        ${mictype}
    done
  done

  for dset in dev eval; do
    local/prepare_data.sh --mictype ref --train false \
      "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \
      ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb_ref
  done
fi

if [ $stage -le 1 ]; then
  # mfccdir should be some place with a largish disk where you
  # want to store MFCC features.
  mfccdir=mfcc
  for x in ${test_sets}; do
    steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \
      --mfcc-config conf/mfcc_hires.conf \
      data/$x exp/make_mfcc/$x $mfccdir
  done
fi

#######################################################################
# Perform SAD on the dev/eval data
#######################################################################
dir=exp/segmentation${affix}
sad_work_dir=exp/sad${affix}_${nnet_type}/
sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a

if [ $stage -le 2 ]; then
  for datadir in ${test_sets}; do
    test_set=data/${datadir}
    if [ ! -f ${test_set}/wav.scp ]; then
      echo "$0: Not performing SAD on ${test_set}"
      exit 0
    fi
    # Perform segmentation
    local/segmentation/detect_speech_activity.sh --nj 10 --stage $sad_stage \
      $test_set $sad_nnet_dir mfcc $sad_work_dir \
      data/${datadir} || exit 1

    mv data/${datadir}_seg data/${datadir}_${nnet_type}_seg
    # Generate RTTM file from segmentation performed by SAD. This can
    # be used to evaluate the performance of the SAD as an intermediate
    # step.
    steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
      data/${datadir}_${nnet_type}_seg/utt2spk data/${datadir}_${nnet_type}_seg/segments \
      data/${datadir}_${nnet_type}_seg/rttm
  done
fi
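For reference, the RTTM file written above uses the standard NIST SPEAKER-record format (record type, recording ID, channel, onset, duration, then a speaker label surrounded by <NA> placeholder fields); a hypothetical line for a CHiME-style session could look like:

SPEAKER S02_U06 1 10.25 3.40 <NA> <NA> speaker1 <NA> <NA>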
#######################################################################
# Perform diarization on the dev/eval data
#######################################################################
if [ $stage -le 3 ]; then
  for datadir in ${test_sets}; do
    local/diarize.sh --nj 10 --cmd "$train_cmd" --stage $diarizer_stage \
      exp/xvector_nnet_1a \
      data/${datadir}_${nnet_type}_seg \
      exp/${datadir}_${nnet_type}_seg_diarization
  done
fi

#######################################################################
# Decode diarized output using trained chain model
#######################################################################
if [ $stage -le 4 ]; then
  :  # placeholder; decoding steps go here
fi

#######################################################################
# Score decoded dev/eval sets
#######################################################################
if [ "$skip_scoring" == "false" ]; then
  :  # placeholder; scoring steps go here
fi
exit 0;
@@ -0,0 +1,85 @@
#!/bin/bash
# Copyright  2019  David Snyder
# Apache 2.0.
#
# This script takes an input directory that has a segments file (and
# a feats.scp file), and performs diarization on it. The output directory
# contains an RTTM file which can be used to resegment the input data.

stage=0
nj=10
cmd="run.pl"
ref_rttm=

echo "$0 $@"  # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
  echo "Usage: $0 <model-dir> <in-data-dir> <out-dir>"
  echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization"
  echo "Options: "
  echo "  --nj <nj>                                        # number of parallel jobs."
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --ref-rttm <path to reference RTTM>              # if present, used to score output RTTM."
  exit 1;
fi

model_dir=$1
data_in=$2
out_dir=$3

name=`basename $data_in`

for f in $data_in/feats.scp $data_in/segments $model_dir/plda \
  $model_dir/final.raw $model_dir/extract.config; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

if [ $stage -le 0 ]; then
  echo "$0: computing features for x-vector extractor"
  utils/fix_data_dir.sh data/${name}
  rm -rf data/${name}_cmn
  local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \
    data/$name data/${name}_cmn exp/${name}_cmn
  cp data/$name/segments exp/${name}_cmn/
  utils/fix_data_dir.sh data/${name}_cmn
fi

if [ $stage -le 1 ]; then
  echo "$0: extracting x-vectors for all segments"
  diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \
    --nj $nj --window 1.5 --period 0.75 --apply-cmn false \
    --min-segment 0.5 $model_dir \
    data/${name}_cmn $out_dir/xvectors_${name}
fi

# Perform PLDA scoring
if [ $stage -le 2 ]; then
  # Perform PLDA scoring on all pairs of segments for each recording.
  echo "$0: performing PLDA scoring between all pairs of x-vectors"
  diarization/nnet3/xvector/score_plda.sh --cmd "$cmd" \
    --target-energy 0.5 \
    --nj $nj $model_dir/ $out_dir/xvectors_${name} \
    $out_dir/xvectors_${name}/plda_scores
fi

if [ $stage -le 3 ]; then
  echo "$0: performing clustering using PLDA scores (we assume 4 speakers per recording)"
  awk '{print $1, "4"}' data/$name/wav.scp > data/$name/reco2num_spk
  diarization/cluster.sh --cmd "$cmd" --nj $nj \
    --reco2num-spk data/$name/reco2num_spk \
    --rttm-channel 1 \
    $out_dir/xvectors_${name}/plda_scores $out_dir
  echo "$0: wrote RTTM to output directory ${out_dir}"
fi
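As an aside, the reco2num_spk file produced by the awk command above is just a two-column text file mapping each recording ID to its assumed speaker count (fixed at 4 here); with hypothetical CHiME-style session/array IDs it would contain lines such as:

S02_U06 4
S09_U06 4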
if [ $stage -le 4 ]; then
  if [ -f "$ref_rttm" ]; then
    echo "$0: computing diarization error rate (DER) using reference ${ref_rttm}"
    mkdir -p $out_dir/tuning/
    md-eval.pl -c 0.25 -1 -r $ref_rttm -s $out_dir/rttm 2> $out_dir/log/der.log > $out_dir/der
    der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' ${out_dir}/der)
    echo "DER: $der%"
  fi
fi
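For context (an assumption about the scoring tool's output, not shown in this commit): md-eval.pl prints a summary line roughly of the form "OVERALL SPEAKER DIARIZATION ERROR = 23.45 percent of scored speaker time", and the grep -oP expression above captures the numeric value that follows "DIARIZATION ERROR = ".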