Merge pull request kaldi-asr#3 from chimechallenge/sad
Track 2 pipeline with SAD and Diarization
Showing 35 changed files with 2,817 additions and 0 deletions.
@@ -77,6 +77,8 @@ GSYMS
/egs/*/*/plp
/egs/*/*/exp
/egs/*/*/data
/egs/*/*/wav
/egs/*/*/enhan

# /tools/
/tools/pocolm/
@@ -0,0 +1,6 @@
This is a Kaldi recipe for the 6th CHiME Speech Separation and Recognition Challenge (CHiME-6).

See http://spandh.dcs.shef.ac.uk/chime_challenge/ for more detailed information.

s5_track1 : Track 1 of the challenge (oracle segments and speaker labels are provided)
s5_track2 : Track 2 of the challenge (only raw audio is provided)
@@ -0,0 +1,15 @@
# You can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
# with slurm.  Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration.  Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="retry.pl queue.pl --mem 2G"
export decode_cmd="queue.pl --mem 4G"
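As the comments above suggest, a machine without a queueing system can simply use run.pl instead. A minimal local-machine variant of cmd.sh (a sketch, not part of this commit) would be:

# hypothetical local-only cmd.sh; run.pl ignores resource options such as --mem
export train_cmd="run.pl"
export decode_cmd="run.pl"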
@@ -0,0 +1,50 @@
# BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/)

# scrolling size to compute the delays
scroll_size = 250

# cross-correlation computation window size
window_size = 500

# maximum number of cross-correlation points taken into account
nbest_amount = 4

# flag whether to apply an automatic noise thresholding
do_noise_threshold = 1

# percentage of frames with lower xcorr taken as noisy
noise_percent = 10

######## acoustic modelling parameters

# transition probabilities weight for multichannel decoding
trans_weight_multi = 25
trans_weight_nbest = 25

###

# flag whether to print the features after setting them, or not
print_features = 1

# flag whether to use the bad frames in the sum process
do_avoid_bad_frames = 1

# flag to use the best channel (by SNR) as a reference
# defined from the command line
do_compute_reference = 1

# flag whether to use a UEM file or not (otherwise process the whole file)
do_use_uem_file = 0

# flag whether to use an adaptive weights scheme or fixed weights
do_adapt_weights = 1

# flag whether to output the sph files or just run the system to create the auxiliary files
do_write_sph_files = 1

#### directories where to store/retrieve info ####
# channels_file = ./cfg-files/channels

# show needs to be passed as an argument normally; here a default one is given just in case
# show_id = Ttmp
@@ -0,0 +1,2 @@
--use-energy=false
--sample-frequency=16000
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated), which is why
# we prefer this method.
--use-energy=false   # use average of log energy, not energy.
--sample-frequency=16000
--num-mel-bins=40
--num-ceps=40
--low-freq=40
--high-freq=-400
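As a usage illustration (not part of this commit), this config is consumed by steps/make_mfcc.sh via --mfcc-config in the track 2 decode script below; an equivalent direct call to Kaldi's compute-mfcc-feats, with hypothetical input and output paths, would be:

# sketch only; data/dev/wav.scp and the mfcc/ output paths are placeholders
compute-mfcc-feats --config=conf/mfcc_hires.conf \
  scp:data/dev/wav.scp ark,scp:mfcc/raw_mfcc_dev.ark,mfcc/raw_mfcc_dev.scp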
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
@@ -0,0 +1,2 @@
affix=_1a
nnet_type=stats
@@ -0,0 +1 @@
../../callhome_diarization/v1/diarization
@@ -0,0 +1,71 @@
#!/bin/bash -u

# Copyright 2015 (c) Johns Hopkins University (Jan Trmal <jtrmal@gmail.com>)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

[ -f ./path.sh ] && . ./path.sh

command -v uconv &>/dev/null \
  || { echo >&2 "uconv not found on PATH. You will have to install ICU4C"; exit 1; }

command -v ngram &>/dev/null \
  || { echo >&2 "srilm not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_srilm.sh to install it"; exit 1; }

if [ -z "${LIBLBFGS:-}" ]; then
  echo >&2 "SRILM is not compiled with support for MaxEnt models."
  echo >&2 "You should use the script \$KALDI_ROOT/tools/install_srilm.sh,"
  echo >&2 "which will take care of compiling SRILM with MaxEnt support."
  exit 1;
fi

sox=`command -v sox 2>/dev/null` \
  || { echo >&2 "sox not found on PATH. Please install it manually (you will need version 14.4.0 or higher)."; exit 1; }

# If sox is found on the PATH, check that the version is correct
if [ ! -z "$sox" ]; then
  sox_version=`$sox --version 2>&1 | head -1 | sed -e 's?.*: ??' -e 's?.* ??'`
  if [[ ! $sox_version =~ v14.4.* ]]; then
    echo "Unsupported sox version $sox_version found on path. You will need version 14.4.0 or higher."
    exit 1
  fi
fi

command -v phonetisaurus-align &>/dev/null \
  || { echo >&2 "Phonetisaurus not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_phonetisaurus.sh to install it"; exit 1; }

command -v BeamformIt &>/dev/null \
  || { echo >&2 "BeamformIt not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_beamformit.sh to install it"; exit 1; }

miniconda_dir=$HOME/miniconda3/
if [ ! -d $miniconda_dir ]; then
  echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh'"
fi

# check if WPE is installed
result=`$miniconda_dir/bin/python -c "\
try:
    import nara_wpe
    print('1')
except ImportError:
    print('0')"`

if [ "$result" != "1" ]; then
  echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh"
  exit 1
fi

exit 0
@@ -0,0 +1,126 @@
#!/bin/bash
#
# This script decodes raw utterances through the entire pipeline:
# Feature extraction -> SAD -> Diarization -> ASR
#
# Copyright  2017  Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
#            2019  Desh Raj, David Snyder, Ashish Arora
# Apache 2.0

# Begin configuration section.
nj=50
decode_nj=20
stage=0
sad_stage=0
diarizer_stage=0
enhancement=
test_sets=
skip_scoring=false
# End configuration section
. ./utils/parse_options.sh

. ./cmd.sh
. ./path.sh
. ./conf/sad.conf

#######################################################################
# Prepare the dev and eval data with dereverberation (WPE) and
# beamforming.
#######################################################################
if [ $stage -le 0 ]; then
  # Beamforming using reference arrays
  # enhanced WAV directory
  enhandir=enhan
  dereverb_dir=${PWD}/wav/wpe/

  for dset in dev eval; do
    for mictype in u01 u02 u03 u04 u06; do
      local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 120G" \
        ${audio_dir}/${dset} \
        ${dereverb_dir}/${dset} \
        ${mictype}
    done
  done

  for dset in dev eval; do
    for mictype in u01 u02 u03 u04 u06; do
      local/run_beamformit.sh --cmd "$train_cmd" \
        ${dereverb_dir}/${dset} \
        ${enhandir}/${dset}_${enhancement}_${mictype} \
        ${mictype}
    done
  done

  for dset in dev eval; do
    local/prepare_data.sh --mictype ref --train false \
      "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \
      ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb_ref
  done
fi

if [ $stage -le 1 ]; then
  # mfccdir should be some place with a largish disk where you
  # want to store MFCC features.
  mfccdir=mfcc
  for x in ${test_sets}; do
    steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \
      --mfcc-config conf/mfcc_hires.conf \
      data/$x exp/make_mfcc/$x $mfccdir
  done
fi

#######################################################################
# Perform SAD on the dev/eval data
#######################################################################
dir=exp/segmentation${affix}
sad_work_dir=exp/sad${affix}_${nnet_type}/
sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a

if [ $stage -le 2 ]; then
  for datadir in ${test_sets}; do
    test_set=data/${datadir}
    if [ ! -f ${test_set}/wav.scp ]; then
      echo "$0: Not performing SAD on ${test_set}"
      exit 0
    fi
    # Perform segmentation
    local/segmentation/detect_speech_activity.sh --nj 10 --stage $sad_stage \
      $test_set $sad_nnet_dir mfcc $sad_work_dir \
      data/${datadir} || exit 1

    mv data/${datadir}_seg data/${datadir}_${nnet_type}_seg
    # Generate RTTM file from segmentation performed by SAD. This can
    # be used to evaluate the performance of the SAD as an intermediate
    # step.
    steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
      data/${datadir}_${nnet_type}_seg/utt2spk data/${datadir}_${nnet_type}_seg/segments \
      data/${datadir}_${nnet_type}_seg/rttm
  done
fi
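For reference, the RTTM file written above uses the standard NIST SPEAKER-record format (record type, recording ID, channel, onset, duration, then a speaker label surrounded by <NA> placeholder fields); a hypothetical line for a CHiME-style session could look like:

SPEAKER S02_U06 1 10.25 3.40 <NA> <NA> speaker1 <NA> <NA>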
#######################################################################
# Perform diarization on the dev/eval data
#######################################################################
if [ $stage -le 3 ]; then
  for datadir in ${test_sets}; do
    local/diarize.sh --nj 10 --cmd "$train_cmd" --stage $diarizer_stage \
      exp/xvector_nnet_1a \
      data/${datadir}_${nnet_type}_seg \
      exp/${datadir}_${nnet_type}_seg_diarization
  done
fi

#######################################################################
# Decode diarized output using trained chain model
#######################################################################
if [ $stage -le 4 ]; then
  :  # placeholder; decoding steps go here
fi

#######################################################################
# Score decoded dev/eval sets
#######################################################################
if [ "$skip_scoring" == "false" ]; then
  :  # placeholder; scoring steps go here
fi
exit 0;
@@ -0,0 +1,85 @@
#!/bin/bash
# Copyright  2019  David Snyder
# Apache 2.0.
#
# This script takes an input directory that has a segments file (and
# a feats.scp file), and performs diarization on it. The output directory
# contains an RTTM file which can be used to resegment the input data.

stage=0
nj=10
cmd="run.pl"
ref_rttm=

echo "$0 $@"  # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
  echo "Usage: $0 <model-dir> <in-data-dir> <out-dir>"
  echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization"
  echo "Options: "
  echo "  --nj <nj>                                        # number of parallel jobs."
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --ref-rttm <path to reference RTTM>              # if present, used to score output RTTM."
  exit 1;
fi

model_dir=$1
data_in=$2
out_dir=$3

name=`basename $data_in`

for f in $data_in/feats.scp $data_in/segments $model_dir/plda \
  $model_dir/final.raw $model_dir/extract.config; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

if [ $stage -le 0 ]; then
  echo "$0: computing features for x-vector extractor"
  utils/fix_data_dir.sh data/${name}
  rm -rf data/${name}_cmn
  local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \
    data/$name data/${name}_cmn exp/${name}_cmn
  cp data/$name/segments exp/${name}_cmn/
  utils/fix_data_dir.sh data/${name}_cmn
fi

if [ $stage -le 1 ]; then
  echo "$0: extracting x-vectors for all segments"
  diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \
    --nj $nj --window 1.5 --period 0.75 --apply-cmn false \
    --min-segment 0.5 $model_dir \
    data/${name}_cmn $out_dir/xvectors_${name}
fi

# Perform PLDA scoring
if [ $stage -le 2 ]; then
  # Perform PLDA scoring on all pairs of segments for each recording.
  echo "$0: performing PLDA scoring between all pairs of x-vectors"
  diarization/nnet3/xvector/score_plda.sh --cmd "$cmd" \
    --target-energy 0.5 \
    --nj $nj $model_dir/ $out_dir/xvectors_${name} \
    $out_dir/xvectors_${name}/plda_scores
fi

if [ $stage -le 3 ]; then
  echo "$0: performing clustering using PLDA scores (we assume 4 speakers per recording)"
  awk '{print $1, "4"}' data/$name/wav.scp > data/$name/reco2num_spk
  diarization/cluster.sh --cmd "$cmd" --nj $nj \
    --reco2num-spk data/$name/reco2num_spk \
    --rttm-channel 1 \
    $out_dir/xvectors_${name}/plda_scores $out_dir
  echo "$0: wrote RTTM to output directory ${out_dir}"
fi
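As an aside, the reco2num_spk file produced by the awk command above is just a two-column text file mapping each recording ID to its assumed speaker count (fixed at 4 here); with hypothetical CHiME-style session/array IDs it would contain lines such as:

S02_U06 4
S09_U06 4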
if [ $stage -le 4 ]; then
  if [ -f "$ref_rttm" ]; then
    echo "$0: computing diarization error rate (DER) using reference ${ref_rttm}"
    mkdir -p $out_dir/tuning/
    md-eval.pl -c 0.25 -1 -r $ref_rttm -s $out_dir/rttm 2> $out_dir/log/der.log > $out_dir/der
    der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' ${out_dir}/der)
    echo "DER: $der%"
  fi
fi
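For context (an assumption about the scoring tool's output, not shown in this commit): md-eval.pl prints a summary line roughly of the form "OVERALL SPEAKER DIARIZATION ERROR = 23.45 percent of scored speaker time", and the grep -oP expression above captures the numeric value that follows "DIARIZATION ERROR = ".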