Skip to content

Commit

Permalink
Merge pull request kaldi-asr#3 from chimechallenge/sad
Browse files Browse the repository at this point in the history
Track 2 pipeline with SAD and Diarization
  • Loading branch information
sw005320 authored Nov 18, 2019
2 parents 687b774 + ec31077 commit 87c0781
Show file tree
Hide file tree
Showing 35 changed files with 2,817 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ GSYMS
/egs/*/*/plp
/egs/*/*/exp
/egs/*/*/data
/egs/*/*/wav
/egs/*/*/enhan

# /tools/
/tools/pocolm/
Expand Down
6 changes: 6 additions & 0 deletions egs/chime6/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
This is a kaldi recipe for the 6th CHiME Speech Separation and Recognition Challenge (CHiME-6).

See http://spandh.dcs.shef.ac.uk/chime_challenge/ for more detailed information.

s5_track1 : Track 1 of the challenge (oracle segments and speaker labels are provided)
s5_track2 : Track 2 of the challenge (only raw audio is provided)
15 changes: 15 additions & 0 deletions egs/chime6/s5_track2/cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Select the parallelization wrappers appropriate for your compute setup.
# With no queueing system (a single local machine), replace every
# 'queue.pl' below with run.pl -- but run commands one at a time, since
# most recipes would otherwise exhaust the machine's memory.  queue.pl
# works with GridEngine (qsub) and slurm.pl works with slurm.  Queues
# differ in their names and in how resources such as memory are
# requested; to adapt to yours, create and edit conf/queue.conf to match
# its configuration.  See the discussion of conf/queue.conf at
# http://kaldi-asr.org/doc/queue.html, or search for 'default_config'
# in utils/queue.pl or utils/slurm.pl, for details.

export train_cmd="retry.pl queue.pl --mem 2G"
export decode_cmd="queue.pl --mem 4G"

50 changes: 50 additions & 0 deletions egs/chime6/s5_track2/conf/beamformit.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/)

# scrolling size to compute the delays
scroll_size = 250

# cross correlation computation window size
window_size = 500

#amount of maximum points for the xcorrelation taken into account
nbest_amount = 4

#flag whether to apply automatic noise thresholding
do_noise_threshold = 1

#Percentage of frames with lower xcorr taken as noisy
noise_percent = 10

######## acoustic modelling parameters

#transition probabilities weight for multichannel decoding
trans_weight_multi = 25
trans_weight_nbest = 25

###

#flag whether to print the features after setting them, or not
print_features = 1

#flag whether to use the bad frames in the sum process
do_avoid_bad_frames = 1

#flag to use the best channel (SNR) as a reference
#defined from command line
do_compute_reference = 1

#flag whether to use a uem file or not (process the whole file)
do_use_uem_file = 0

#flag whether to use an adaptive weights scheme or fixed weights
do_adapt_weights = 1

#flag whether to output the sph files or just run the system to create the auxiliary files
do_write_sph_files = 1

####directories where to store/retrieve info####
#channels_file = ./cfg-files/channels

#show needs to be passed as argument normally, here a default one is given just in case
#show_id = Ttmp

2 changes: 2 additions & 0 deletions egs/chime6/s5_track2/conf/mfcc.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
--use-energy=false
--sample-frequency=16000
10 changes: 10 additions & 0 deletions egs/chime6/s5_track2/conf/mfcc_hires.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--sample-frequency=16000
--num-mel-bins=40
--num-ceps=40
--low-freq=40
--high-freq=-400
1 change: 1 addition & 0 deletions egs/chime6/s5_track2/conf/online_cmvn.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
2 changes: 2 additions & 0 deletions egs/chime6/s5_track2/conf/sad.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
affix=_1a
nnet_type=stats
1 change: 1 addition & 0 deletions egs/chime6/s5_track2/diarization
71 changes: 71 additions & 0 deletions egs/chime6/s5_track2/local/check_tools.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/bin/bash -u

# Copyright 2015 (c) Johns Hopkins University (Jan Trmal <jtrmal@gmail.com>)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Sanity-check the external tools this recipe needs (ICU's uconv, SRILM
# with MaxEnt support, sox >= 14.4, Phonetisaurus, BeamformIt, and a
# miniconda python with nara_wpe).  Prints an actionable message and
# exits non-zero when something is missing.

[ -f ./path.sh ] && . ./path.sh

command -v uconv &>/dev/null \
  || { echo >&2 "uconv not found on PATH. You will have to install ICU4C"; exit 1; }

command -v ngram &>/dev/null \
  || { echo >&2 "srilm not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_srilm.sh to install it"; exit 1; }

# Use ${LIBLBFGS:-} so that the shebang's 'set -u' does not abort with an
# "unbound variable" error before we can print the intended message.
if [ -z "${LIBLBFGS:-}" ]; then
  echo >&2 "SRILM is not compiled with the support of MaxEnt models."
  echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh"
  echo >&2 "which will take care of compiling the SRILM with MaxEnt support"
  exit 1;
fi

sox=$(command -v sox 2>/dev/null) \
  || { echo >&2 "sox not found on PATH. Please install it manually (you will need version 14.4.0 and higher)."; exit 1; }

# If sox is found on path, check if the version is correct
if [ ! -z "$sox" ]; then
  sox_version=$("$sox" --version 2>&1 | head -1 | sed -e 's?.*: ??' -e 's?.* ??')
  if [[ ! $sox_version =~ v14.4.* ]]; then
    echo "Unsupported sox version $sox_version found on path. You will need version v14.4.0 and higher."
    exit 1
  fi
fi

command -v phonetisaurus-align &>/dev/null \
  || { echo >&2 "Phonetisaurus not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_phonetisaurus.sh to install it"; exit 1; }

command -v BeamformIt &>/dev/null \
  || { echo >&2 "BeamformIt not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_beamformit.sh to install it"; exit 1; }

miniconda_dir=$HOME/miniconda3/
if [ ! -d "$miniconda_dir" ]; then
  echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh'"
fi

# check if WPE is installed: prints '1' if nara_wpe imports, '0' otherwise
result=$("$miniconda_dir/bin/python" -c "\
try:
    import nara_wpe
    print('1')
except ImportError:
    print('0')")

# NOTE: the original used 'continue' here, which is invalid outside a
# loop; test the failure case directly instead.
if [ "$result" != "1" ]; then
  echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh"
  exit 1
fi

exit 0
126 changes: 126 additions & 0 deletions egs/chime6/s5_track2/local/decode.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/bin/bash
#
# This script decodes raw utterances through the entire pipeline:
# Feature extraction -> SAD -> Diarization -> ASR
#
# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
#           2019 Desh Raj, David Snyder, Ashish Arora
# Apache 2.0

# Begin configuration section.
nj=50
decode_nj=20
stage=0
sad_stage=0
diarizer_stage=0
enhancement=
test_sets=
skip_scoring=false
# Declared here so parse_options.sh accepts --audio-dir/--json-dir; the
# original script used these in stage 0 without ever defining them, so
# they silently expanded to the empty string.
audio_dir=     # root of the original CHiME-6 audio (needed for stage 0)
json_dir=      # root of the CHiME-6 transcription JSONs (needed for stage 0)
# End configuration section
. ./utils/parse_options.sh

. ./cmd.sh
. ./path.sh
. ./conf/sad.conf

#######################################################################
# Prepare the dev and eval data with dereverberation (WPE) and
# beamforming.
#######################################################################
if [ $stage -le 0 ]; then
  # Beamforming using reference arrays
  # enhanced WAV directory
  enhandir=enhan
  dereverb_dir=${PWD}/wav/wpe/

  # Dereverberate each array's recordings with WPE.
  for dset in dev eval; do
    for mictype in u01 u02 u03 u04 u06; do
      local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 120G" \
        "${audio_dir}/${dset}" \
        "${dereverb_dir}/${dset}" \
        ${mictype}
    done
  done

  # Beamform each dereverberated array into a single enhanced channel.
  for dset in dev eval; do
    for mictype in u01 u02 u03 u04 u06; do
      local/run_beamformit.sh --cmd "$train_cmd" \
        "${dereverb_dir}/${dset}" \
        "${enhandir}/${dset}_${enhancement}_${mictype}" \
        ${mictype}
    done
  done

  # Build Kaldi data directories from the enhanced audio.
  for dset in dev eval; do
    local/prepare_data.sh --mictype ref --train false \
      "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \
      "${json_dir}/${dset}" data/${dset}_${enhancement}_dereverb_ref
  done
fi

if [ $stage -le 1 ]; then
  # mfccdir should be some place with a largish disk where you
  # want to store MFCC features.
  mfccdir=mfcc
  for x in ${test_sets}; do
    steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \
      --mfcc-config conf/mfcc_hires.conf \
      data/$x exp/make_mfcc/$x $mfccdir
  done
fi

#######################################################################
# Perform SAD on the dev/eval data
#######################################################################
dir=exp/segmentation${affix}
sad_work_dir=exp/sad${affix}_${nnet_type}/
sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a

if [ $stage -le 2 ]; then
  for datadir in ${test_sets}; do
    test_set=data/${datadir}
    if [ ! -f ${test_set}/wav.scp ]; then
      echo "$0: Not performing SAD on ${test_set}"
      exit 0
    fi
    # Perform segmentation
    local/segmentation/detect_speech_activity.sh --nj 10 --stage $sad_stage \
      $test_set $sad_nnet_dir mfcc $sad_work_dir \
      data/${datadir} || exit 1

    mv data/${datadir}_seg data/${datadir}_${nnet_type}_seg
    # Generate RTTM file from segmentation performed by SAD. This can
    # be used to evaluate the performance of the SAD as an intermediate
    # step.
    steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
      data/${datadir}_${nnet_type}_seg/utt2spk data/${datadir}_${nnet_type}_seg/segments \
      data/${datadir}_${nnet_type}_seg/rttm
  done
fi

#######################################################################
# Perform diarization on the dev/eval data
#######################################################################
if [ $stage -le 3 ]; then
  for datadir in ${test_sets}; do
    local/diarize.sh --nj 10 --cmd "$train_cmd" --stage $diarizer_stage \
      exp/xvector_nnet_1a \
      data/${datadir}_${nnet_type}_seg \
      exp/${datadir}_${nnet_type}_seg_diarization
  done
fi

#######################################################################
# Decode diarized output using trained chain model
#######################################################################
if [ $stage -le 4 ]; then
  # TODO(review): ASR decoding is not implemented yet.  The original
  # used 'continue' here, which is invalid outside a loop (bash emits an
  # error); ':' is the correct no-op placeholder.
  :
fi

#######################################################################
# Score decoded dev/eval sets
#######################################################################
if [ "$skip_scoring" == "false" ]; then
  # TODO(review): scoring is not implemented yet; no-op placeholder
  # replaces an invalid top-level 'continue'.
  :
fi
exit 0;
85 changes: 85 additions & 0 deletions egs/chime6/s5_track2/local/diarize.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/bin/bash
# Copyright 2019 David Snyder
# Apache 2.0.
#
# This script takes an input directory that has a segments file (and
# a feats.scp file), and performs diarization on it. The output directory
# contains an RTTM file which can be used to resegment the input data.

stage=0
nj=10
cmd="run.pl"
ref_rttm=

echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
  echo "Usage: $0 <model-dir> <in-data-dir> <out-dir>"
  echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization"
  echo "Options: "
  echo " --nj <nj> # number of parallel jobs."
  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo " --ref-rttm <path to reference RTTM> # if present, used to score output RTTM."
  exit 1;
fi

model_dir=$1
data_in=$2
out_dir=$3

name=$(basename "$data_in")

# All of these inputs must exist before we can diarize.
for f in $data_in/feats.scp $data_in/segments $model_dir/plda \
  $model_dir/final.raw $model_dir/extract.config; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

if [ $stage -le 0 ]; then
  echo "$0: computing features for x-vector extractor"
  utils/fix_data_dir.sh data/${name}
  rm -rf data/${name}_cmn
  local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \
    data/$name data/${name}_cmn exp/${name}_cmn
  cp data/$name/segments exp/${name}_cmn/
  utils/fix_data_dir.sh data/${name}_cmn
fi

if [ $stage -le 1 ]; then
  echo "$0: extracting x-vectors for all segments"
  diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \
    --nj $nj --window 1.5 --period 0.75 --apply-cmn false \
    --min-segment 0.5 $model_dir \
    data/${name}_cmn $out_dir/xvectors_${name}
fi

# Perform PLDA scoring
if [ $stage -le 2 ]; then
  # Perform PLDA scoring on all pairs of segments for each recording.
  echo "$0: performing PLDA scoring between all pairs of x-vectors"
  diarization/nnet3/xvector/score_plda.sh --cmd "$cmd" \
    --target-energy 0.5 \
    --nj $nj $model_dir/ $out_dir/xvectors_${name} \
    $out_dir/xvectors_${name}/plda_scores
fi

if [ $stage -le 3 ]; then
  echo "$0: performing clustering using PLDA scores (we assume 4 speakers per recording)"
  awk '{print $1, "4"}' data/$name/wav.scp > data/$name/reco2num_spk
  diarization/cluster.sh --cmd "$cmd" --nj $nj \
    --reco2num-spk data/$name/reco2num_spk \
    --rttm-channel 1 \
    $out_dir/xvectors_${name}/plda_scores $out_dir
  echo "$0: wrote RTTM to output directory ${out_dir}"
fi

if [ $stage -le 4 ]; then
  # "$ref_rttm" must be quoted: with the default empty value, the
  # unquoted original collapsed to the one-argument test '[ -f ]', which
  # is TRUE, so md-eval.pl ran with an empty -r argument.
  if [ -f "$ref_rttm" ]; then
    echo "$0: computing diarization error rate (DER) using reference ${ref_rttm}"
    mkdir -p $out_dir/tuning/ $out_dir/log/
    md-eval.pl -c 0.25 -1 -r $ref_rttm -s $out_dir/rttm 2> $out_dir/log/der.log > $out_dir/der
    der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' ${out_dir}/der)
    echo "DER: $der%"
  fi
fi

Loading

0 comments on commit 87c0781

Please sign in to comment.