From 02eb1d607ff3ef37e453f70f23e740570dd52c7d Mon Sep 17 00:00:00 2001 From: medennikov Date: Wed, 20 May 2020 13:34:03 +0300 Subject: [PATCH 01/10] initial commit --- egs/chime6/s5b_track2/RESULTS | 23 ++ egs/chime6/s5b_track2/cmd.sh | 14 + egs/chime6/s5b_track2/conf/beamformit.cfg | 50 +++ egs/chime6/s5b_track2/conf/mfcc.conf | 2 + egs/chime6/s5b_track2/conf/mfcc_hires.conf | 10 + egs/chime6/s5b_track2/conf/online_cmvn.conf | 1 + egs/chime6/s5b_track2/conf/sad.conf | 2 + egs/chime6/s5b_track2/diarization | 1 + egs/chime6/s5b_track2/local/chain | 1 + .../s5b_track2/local/check_dset_error.py | 69 ++++ egs/chime6/s5b_track2/local/check_tools.sh | 1 + .../convert_rttm_to_utt2spk_and_segments.py | 98 ++++++ .../s5b_track2/local/copy_lat_dir_parallel.sh | 1 + egs/chime6/s5b_track2/local/decode.sh | 217 +++++++++++++ .../s5b_track2/local/decode_diarized.sh | 74 +++++ egs/chime6/s5b_track2/local/diarize.sh | 119 +++++++ .../s5b_track2/local/distant_audio_list | 1 + egs/chime6/s5b_track2/local/extract_noises.py | 1 + .../s5b_track2/local/extract_vad_weights.sh | 1 + .../s5b_track2/local/gen_aligned_hyp.py | 43 +++ .../s5b_track2/local/generate_chime6_data.sh | 1 + egs/chime6/s5b_track2/local/get_best_error.py | 84 +++++ .../local/get_hyp_perspeaker_perarray_file.py | 63 ++++ .../get_ref_perspeaker_persession_file.py | 86 +++++ egs/chime6/s5b_track2/local/install_dscore.sh | 8 + .../s5b_track2/local/install_pb_chime5.sh | 1 + egs/chime6/s5b_track2/local/json2text.py | 1 + .../s5b_track2/local/make_noise_list.py | 1 + egs/chime6/s5b_track2/local/make_voxceleb1.pl | 130 ++++++++ egs/chime6/s5b_track2/local/make_voxceleb2.pl | 70 ++++ .../s5b_track2/local/multispeaker_score.sh | 144 +++++++++ .../s5b_track2/local/nnet3/compare_wer.sh | 1 + egs/chime6/s5b_track2/local/nnet3/decode.sh | 1 + .../local/nnet3/run_ivector_common.sh | 1 + .../local/nnet3/xvector/prepare_feats.sh | 89 ++++++ .../nnet3/xvector/prepare_feats_for_egs.sh | 83 +++++ .../local/nnet3/xvector/run_xvector.sh 
| 1 + .../nnet3/xvector/tuning/run_xvector_1a.sh | 149 +++++++++ egs/chime6/s5b_track2/local/prepare_data.sh | 149 +++++++++ egs/chime6/s5b_track2/local/prepare_dict.sh | 1 + .../s5b_track2/local/print_dset_error.py | 35 ++ .../s5b_track2/local/reverberate_lat_dir.sh | 1 + egs/chime6/s5b_track2/local/run_beamformit.sh | 1 + .../s5b_track2/local/run_ivector_common.sh | 1 + egs/chime6/s5b_track2/local/run_wpe.py | 1 + egs/chime6/s5b_track2/local/run_wpe.sh | 1 + egs/chime6/s5b_track2/local/score.sh | 1 + .../s5b_track2/local/score_for_submit.sh | 102 ++++++ .../segmentation/detect_speech_activity.sh | 217 +++++++++++++ .../segmentation/tuning/train_lstm_sad_1a.sh | 140 ++++++++ .../segmentation/tuning/train_stats_sad_1a.sh | 150 +++++++++ egs/chime6/s5b_track2/local/train_diarizer.sh | 186 +++++++++++ .../s5b_track2/local/train_lms_srilm.sh | 1 + egs/chime6/s5b_track2/local/train_sad.sh | 155 +++++++++ egs/chime6/s5b_track2/local/truncate_rttm.py | 39 +++ egs/chime6/s5b_track2/local/uem_file | 20 ++ egs/chime6/s5b_track2/local/wer_output_filter | 1 + egs/chime6/s5b_track2/path.sh | 9 + egs/chime6/s5b_track2/run.sh | 300 ++++++++++++++++++ egs/chime6/s5b_track2/sid | 1 + egs/chime6/s5b_track2/steps | 1 + egs/chime6/s5b_track2/utils | 1 + 62 files changed, 3157 insertions(+) create mode 100644 egs/chime6/s5b_track2/RESULTS create mode 100644 egs/chime6/s5b_track2/cmd.sh create mode 100755 egs/chime6/s5b_track2/conf/beamformit.cfg create mode 100644 egs/chime6/s5b_track2/conf/mfcc.conf create mode 100644 egs/chime6/s5b_track2/conf/mfcc_hires.conf create mode 100644 egs/chime6/s5b_track2/conf/online_cmvn.conf create mode 100644 egs/chime6/s5b_track2/conf/sad.conf create mode 120000 egs/chime6/s5b_track2/diarization create mode 120000 egs/chime6/s5b_track2/local/chain create mode 100755 egs/chime6/s5b_track2/local/check_dset_error.py create mode 120000 egs/chime6/s5b_track2/local/check_tools.sh create mode 100755 
egs/chime6/s5b_track2/local/convert_rttm_to_utt2spk_and_segments.py create mode 120000 egs/chime6/s5b_track2/local/copy_lat_dir_parallel.sh create mode 100755 egs/chime6/s5b_track2/local/decode.sh create mode 100755 egs/chime6/s5b_track2/local/decode_diarized.sh create mode 100755 egs/chime6/s5b_track2/local/diarize.sh create mode 120000 egs/chime6/s5b_track2/local/distant_audio_list create mode 120000 egs/chime6/s5b_track2/local/extract_noises.py create mode 120000 egs/chime6/s5b_track2/local/extract_vad_weights.sh create mode 100755 egs/chime6/s5b_track2/local/gen_aligned_hyp.py create mode 120000 egs/chime6/s5b_track2/local/generate_chime6_data.sh create mode 100755 egs/chime6/s5b_track2/local/get_best_error.py create mode 100755 egs/chime6/s5b_track2/local/get_hyp_perspeaker_perarray_file.py create mode 100755 egs/chime6/s5b_track2/local/get_ref_perspeaker_persession_file.py create mode 100755 egs/chime6/s5b_track2/local/install_dscore.sh create mode 120000 egs/chime6/s5b_track2/local/install_pb_chime5.sh create mode 120000 egs/chime6/s5b_track2/local/json2text.py create mode 120000 egs/chime6/s5b_track2/local/make_noise_list.py create mode 100755 egs/chime6/s5b_track2/local/make_voxceleb1.pl create mode 100755 egs/chime6/s5b_track2/local/make_voxceleb2.pl create mode 100755 egs/chime6/s5b_track2/local/multispeaker_score.sh create mode 120000 egs/chime6/s5b_track2/local/nnet3/compare_wer.sh create mode 120000 egs/chime6/s5b_track2/local/nnet3/decode.sh create mode 120000 egs/chime6/s5b_track2/local/nnet3/run_ivector_common.sh create mode 100755 egs/chime6/s5b_track2/local/nnet3/xvector/prepare_feats.sh create mode 100755 egs/chime6/s5b_track2/local/nnet3/xvector/prepare_feats_for_egs.sh create mode 120000 egs/chime6/s5b_track2/local/nnet3/xvector/run_xvector.sh create mode 100755 egs/chime6/s5b_track2/local/nnet3/xvector/tuning/run_xvector_1a.sh create mode 100755 egs/chime6/s5b_track2/local/prepare_data.sh create mode 120000 
egs/chime6/s5b_track2/local/prepare_dict.sh create mode 100755 egs/chime6/s5b_track2/local/print_dset_error.py create mode 120000 egs/chime6/s5b_track2/local/reverberate_lat_dir.sh create mode 120000 egs/chime6/s5b_track2/local/run_beamformit.sh create mode 120000 egs/chime6/s5b_track2/local/run_ivector_common.sh create mode 120000 egs/chime6/s5b_track2/local/run_wpe.py create mode 120000 egs/chime6/s5b_track2/local/run_wpe.sh create mode 120000 egs/chime6/s5b_track2/local/score.sh create mode 100755 egs/chime6/s5b_track2/local/score_for_submit.sh create mode 100755 egs/chime6/s5b_track2/local/segmentation/detect_speech_activity.sh create mode 100755 egs/chime6/s5b_track2/local/segmentation/tuning/train_lstm_sad_1a.sh create mode 100755 egs/chime6/s5b_track2/local/segmentation/tuning/train_stats_sad_1a.sh create mode 100755 egs/chime6/s5b_track2/local/train_diarizer.sh create mode 120000 egs/chime6/s5b_track2/local/train_lms_srilm.sh create mode 100755 egs/chime6/s5b_track2/local/train_sad.sh create mode 100755 egs/chime6/s5b_track2/local/truncate_rttm.py create mode 100644 egs/chime6/s5b_track2/local/uem_file create mode 120000 egs/chime6/s5b_track2/local/wer_output_filter create mode 100644 egs/chime6/s5b_track2/path.sh create mode 100755 egs/chime6/s5b_track2/run.sh create mode 120000 egs/chime6/s5b_track2/sid create mode 120000 egs/chime6/s5b_track2/steps create mode 120000 egs/chime6/s5b_track2/utils diff --git a/egs/chime6/s5b_track2/RESULTS b/egs/chime6/s5b_track2/RESULTS new file mode 100644 index 00000000000..131b43cecf8 --- /dev/null +++ b/egs/chime6/s5b_track2/RESULTS @@ -0,0 +1,23 @@ +# Results for Chime-6 track 2 for dev and eval, using pretrained models +# available at http://kaldi-asr.org/models/m12. + +# These results are reported only for array U06, which is the default +# array selection method in the baseline system. 
+ +# Speech Activity Detection (SAD) + Missed speech False alarm Total error +Dev (old RTTM) 2.5 0.8 3.3 +Dev (new RTTM) 1.9 0.7 2.6 +Eval (old RTTM) 4.1 1.8 5.9 +Eval (new RTTM) 4.3 1.5 5.8 + +# Diarization + DER JER +Dev (old RTTM) 61.56 69.75 +Dev (new RTTM) 63.42 70.83 +Eval (old RTTM) 61.96 71.40 +Eval (new RTTM) 68.20 72.54 + +# ASR nnet3 tdnn+chain +Dev: %WER 84.25 [ 49610 / 58881, 1937 ins, 34685 del, 12988 sub ] +Eval: %WER 77.94 [ 42971 / 55132, 1086 ins, 30839 del, 11046 sub ] diff --git a/egs/chime6/s5b_track2/cmd.sh b/egs/chime6/s5b_track2/cmd.sh new file mode 100644 index 00000000000..86514d94d4d --- /dev/null +++ b/egs/chime6/s5b_track2/cmd.sh @@ -0,0 +1,14 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/chime6/s5b_track2/conf/beamformit.cfg b/egs/chime6/s5b_track2/conf/beamformit.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime6/s5b_track2/conf/beamformit.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag whether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag whether to print the features after setting them, or not +print_features = 1 + +#flag whether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag whether to use a uem file or not (process all the file) +do_use_uem_file = 0 + +#flag whether to use an adaptive weights scheme or fixed weights +do_adapt_weights = 1 + +#flag whether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime6/s5b_track2/conf/mfcc.conf b/egs/chime6/s5b_track2/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/chime6/s5b_track2/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git
a/egs/chime6/s5b_track2/conf/mfcc_hires.conf b/egs/chime6/s5b_track2/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/chime6/s5b_track2/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/chime6/s5b_track2/conf/online_cmvn.conf b/egs/chime6/s5b_track2/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/chime6/s5b_track2/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/chime6/s5b_track2/conf/sad.conf b/egs/chime6/s5b_track2/conf/sad.conf new file mode 100644 index 00000000000..752bb1cf6c5 --- /dev/null +++ b/egs/chime6/s5b_track2/conf/sad.conf @@ -0,0 +1,2 @@ +affix=_1a +nnet_type=stats diff --git a/egs/chime6/s5b_track2/diarization b/egs/chime6/s5b_track2/diarization new file mode 120000 index 00000000000..bad937c1444 --- /dev/null +++ b/egs/chime6/s5b_track2/diarization @@ -0,0 +1 @@ +../../callhome_diarization/v1/diarization \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/chain b/egs/chime6/s5b_track2/local/chain new file mode 120000 index 00000000000..dd7910711d1 --- /dev/null +++ b/egs/chime6/s5b_track2/local/chain @@ -0,0 +1 @@ +../../s5_track1/local/chain/ \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/check_dset_error.py b/egs/chime6/s5b_track2/local/check_dset_error.py new file mode 100755 index 00000000000..0ed7f59ae83 --- /dev/null +++ b/egs/chime6/s5b_track2/local/check_dset_error.py @@ -0,0 +1,69 @@ 
+#!/usr/bin/env python3 +# Copyright 2019 Ashish Arora +# Apache 2.0. + +import argparse +import sys, os +import string + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script checks that the per-speaker WER statistics + sum up to the per-recording totals listed in all.txt""") + parser.add_argument("wer_dir_path", type=str, + help="path of directory containing wer files") + parser.add_argument("output_dir_path", type=str, + help="path of the directory containing per speaker output files") + args = parser.parse_args() + return args + +def get_results(filename): # returns (total words, ins, del, sub) parsed from the first line of a wer file + with open(filename) as f: + first_line = f.readline() + parts = first_line.strip().split(',') + total_words = parts[0].split()[-1] + ins = parts[1].split()[0] + deletion = parts[2].split()[0] + sub = parts[3].split()[0] + return int(total_words), int(ins), int(deletion), int(sub) + +def main(): + args = get_args() + recodingid_error_dict={} + min_wer_per_recording = os.path.join(args.wer_dir_path, 'all.txt') + for line in open(min_wer_per_recording, 'r', encoding='utf8'): + toks = line.strip().split() + recordingid = toks[1] + total_words = toks[-5][:-1] + total_errors = toks[-4][:-1] + total_ins = toks[-3][:-1] + total_del = toks[-2][:-1] + total_sub = toks[-1] + recodingid_error_dict[recordingid]=(total_words, total_errors, total_ins, total_del, total_sub) + + recording_spkorder_file = os.path.join(args.output_dir_path, 'recordinid_spkorder') + for line in open(recording_spkorder_file, 'r', encoding='utf8'): + parts = line.strip().split(':') + recordingid = parts[0] + spkorder = parts[1] + spkorder_list=spkorder.split('_') + num_speakers=len(spkorder_list) + total_errors=total_words=total_ins=total_del=total_sub=0 + for i in range(1, num_speakers+1): + filename = 'wer_' + recordingid + '_' + 'r' + str(i)+ 'h' + str(spkorder_list[i-1]) + wer_filename = os.path.join(args.wer_dir_path, filename) + words, ins, deletion, sub = get_results(wer_filename) + total_words += words + total_ins += ins + total_del +=
deletion + total_sub += sub + total_errors += ins + deletion + sub + assert int(total_words) == int(recodingid_error_dict[recordingid][0]), "Total words mismatch" + assert int(total_errors) == int(recodingid_error_dict[recordingid][1]), "Total errors mismatch" + assert int(total_ins) == int(recodingid_error_dict[recordingid][2]), "Total insertions mismatch" + assert int(total_del) == int(recodingid_error_dict[recordingid][3]), "Total deletions mismatch" + assert int(total_sub) == int(recodingid_error_dict[recordingid][4]), "Total substitutions mismatch" + + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5b_track2/local/check_tools.sh b/egs/chime6/s5b_track2/local/check_tools.sh new file mode 120000 index 00000000000..4e835e887f2 --- /dev/null +++ b/egs/chime6/s5b_track2/local/check_tools.sh @@ -0,0 +1 @@ +../../s5_track1/local/check_tools.sh \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/convert_rttm_to_utt2spk_and_segments.py b/egs/chime6/s5b_track2/local/convert_rttm_to_utt2spk_and_segments.py new file mode 100755 index 00000000000..410dced190c --- /dev/null +++ b/egs/chime6/s5b_track2/local/convert_rttm_to_utt2spk_and_segments.py @@ -0,0 +1,98 @@ +#! /usr/bin/env python +# Copyright 2019 Vimal Manohar +# Apache 2.0. + +"""This script converts an RTTM with +speaker info into kaldi utt2spk and segments""" + +import argparse + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script converts an RTTM with + speaker info into kaldi utt2spk and segments""") + parser.add_argument("--use-reco-id-as-spkr", type=str, + choices=["true", "false"], default="false", + help="Use the recording ID based on RTTM and " + "reco2file_and_channel as the speaker") + parser.add_argument("--append-reco-id-to-spkr", type=str, + choices=["true", "false"], default="false", + help="Append recording ID to the speaker ID") + + parser.add_argument("rttm_file", type=str, + help="""Input RTTM file. 
+ The format of the RTTM file is + """ + """ """) + parser.add_argument("reco2file_and_channel", type=str, + help="""Input reco2file_and_channel. + The format is .""") + parser.add_argument("utt2spk", type=str, + help="Output utt2spk file") + parser.add_argument("segments", type=str, + help="Output segments file") + + args = parser.parse_args() + + args.use_reco_id_as_spkr = bool(args.use_reco_id_as_spkr == "true") + args.append_reco_id_to_spkr = bool(args.append_reco_id_to_spkr == "true") + + if args.use_reco_id_as_spkr: + if args.append_reco_id_to_spkr: + raise Exception("Appending recording ID to speaker does not make sense when using --use-reco-id-as-spkr=true") + + return args + +def main(): + args = get_args() + + file_and_channel2reco = {} + utt2spk={} + segments={} + for line in open(args.reco2file_and_channel): + parts = line.strip().split() + file_and_channel2reco[(parts[1], parts[2])] = parts[0] + + utt2spk_writer = open(args.utt2spk, 'w') + segments_writer = open(args.segments, 'w') + for line in open(args.rttm_file): + parts = line.strip().split() + if parts[0] != "SPEAKER": + continue + + file_id = parts[1] + channel = parts[2] + + try: + reco = file_and_channel2reco[(file_id, channel)] + except KeyError as e: + raise Exception("Could not find recording with " + "(file_id, channel) " + "= ({0},{1}) in {2}: {3}\n".format( + file_id, channel, + args.reco2file_and_channel, str(e))) + + start_time = float(parts[3]) + end_time = start_time + float(parts[4]) + + if args.use_reco_id_as_spkr: + spkr = reco + else: + if args.append_reco_id_to_spkr: + spkr = reco + "-" + parts[7] + else: + spkr = parts[7] + + st = int(start_time * 100) + end = int(end_time * 100) + utt = "{0}-{1:06d}-{2:06d}".format(spkr, st, end) + utt2spk[utt]=spkr + segments[utt]=(reco, start_time, end_time) + + for uttid_id in sorted(utt2spk): + utt2spk_writer.write("{0} {1}\n".format(uttid_id, utt2spk[uttid_id])) + segments_writer.write("{0} {1} {2:7.2f} {3:7.2f}\n".format( + uttid_id, 
segments[uttid_id][0], segments[uttid_id][1], segments[uttid_id][2])) + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5b_track2/local/copy_lat_dir_parallel.sh b/egs/chime6/s5b_track2/local/copy_lat_dir_parallel.sh new file mode 120000 index 00000000000..a168a917d92 --- /dev/null +++ b/egs/chime6/s5b_track2/local/copy_lat_dir_parallel.sh @@ -0,0 +1 @@ +../../s5_track1/local/copy_lat_dir_parallel.sh \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/decode.sh b/egs/chime6/s5b_track2/local/decode.sh new file mode 100755 index 00000000000..8f094f5c4df --- /dev/null +++ b/egs/chime6/s5b_track2/local/decode.sh @@ -0,0 +1,217 @@ +#!/usr/bin/env bash +# +# This script decodes raw utterances through the entire pipeline: +# Feature extraction -> SAD -> Diarization -> ASR +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# 2019 Desh Raj, David Snyder, Ashish Arora, Zhaoheng Ni +# Apache 2.0 + +# Begin configuration section. +nj=8 +stage=0 +sad_stage=0 +score_sad=true +diarizer_stage=0 +decode_diarize_stage=0 +score_stage=0 + +enhancement=beamformit + +# option to use the new RTTM reference for sad and diarization +use_new_rttm_reference=false + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +# chime6 data directories, which are generated from ${chime5_corpus}, +# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +audio_dir=${chime6_corpus}/audio + +enhanced_dir=enhanced +enhanced_dir=$(utils/make_absolute.sh $enhanced_dir) || exit 1 + +# training data +train_set=train_worn_simu_u400k +test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb" + +. ./utils/parse_options.sh +# clone the new RTTM reference only after option parsing, so that --use-new-rttm-reference true takes effect +if [ "$use_new_rttm_reference" == "true" ] && [ ! -d chime6_rttm ]; then + git clone https://github.com/nateanl/chime6_rttm || exit 1 +fi +. ./cmd.sh +. ./path.sh +.
./conf/sad.conf + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +########################################################################### +# We first generate the synchronized audio files across arrays and +# corresponding JSON files. Note that this requires sox v14.4.2, +# which is installed via miniconda in ./local/check_tools.sh +########################################################################### + +if [ $stage -le 0 ]; then + local/generate_chime6_data.sh \ + --cmd "$train_cmd" \ + ${chime5_corpus} \ + ${chime6_corpus} +fi + +####################################################################### +# Prepare the dev and eval data with dereverberation (WPE) and +# beamforming. +####################################################################### +if [ $stage -le 1 ]; then + # Beamforming using reference arrays + # enhanced WAV directory + enhandir=enhan + dereverb_dir=${PWD}/wav/wpe/ + + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u06; do + local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 20G" \ + ${audio_dir}/${dset} \ + ${dereverb_dir}/${dset} \ + ${mictype} + done + done + + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${dereverb_dir}/${dset} \ + ${enhandir}/${dset}_${enhancement}_${mictype} \ + ${mictype} + done + done + + # Note that for the evaluation sets, we use the flag + # "--train false". This keeps the files segments, text, + # and utt2spk with .bak extensions, so that they can + # be used later for scoring if needed but are not used + # in the intermediate stages. + for dset in dev eval; do + local/prepare_data.sh --mictype ref --train false \ + "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \ + ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb + done + +fi + +if [ $stage -le 2 ]; then + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. 
+ mfccdir=mfcc + for x in ${test_sets}; do + steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \ + --mfcc-config conf/mfcc_hires.conf \ + data/$x exp/make_mfcc/$x $mfccdir + done +fi + +####################################################################### +# Perform SAD on the dev/eval data +####################################################################### +dir=exp/segmentation${affix} +sad_work_dir=exp/sad${affix}_${nnet_type}/ +sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a + +if [ $stage -le 3 ]; then + for datadir in ${test_sets}; do + test_set=data/${datadir} + if [ ! -f ${test_set}/wav.scp ]; then + echo "$0: Not performing SAD on ${test_set}" + exit 0 + fi + # Perform segmentation + local/segmentation/detect_speech_activity.sh --nj $nj --stage $sad_stage \ + $test_set $sad_nnet_dir mfcc $sad_work_dir \ + data/${datadir} || exit 1 + + test_dir=data/${datadir}_${nnet_type}_seg + mv data/${datadir}_seg ${test_dir}/ + cp data/${datadir}/{segments.bak,utt2spk.bak} ${test_dir}/ + # Generate RTTM file from segmentation performed by SAD. This can + # be used to evaluate the performance of the SAD as an intermediate + # step. + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ + ${test_dir}/utt2spk ${test_dir}/segments ${test_dir}/rttm + + if [ $score_sad == "true" ]; then + echo "Scoring $datadir.." + # We first generate the reference RTTM from the backed up utt2spk and segments + # files. + ref_rttm=${test_dir}/ref_rttm + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py ${test_dir}/utt2spk.bak \ + ${test_dir}/segments.bak ${test_dir}/ref_rttm + + # To score, we select just U06 segments from the hypothesis RTTM. + hyp_rttm=${test_dir}/rttm.U06 + grep 'U06' ${test_dir}/rttm > ${test_dir}/rttm.U06 + echo "Array U06 selected for scoring.." + + if $use_new_rttm_reference == "true"; then + echo "Use the new RTTM reference." 
+ mode="$(cut -d'_' -f1 <<<"$datadir")" + ref_rttm=./chime6_rttm/${mode}_rttm + fi + + sed 's/_U0[1-6].ENH//g' $ref_rttm > $ref_rttm.scoring + sed 's/_U0[1-6].ENH//g' $hyp_rttm > $hyp_rttm.scoring + cat ./local/uem_file | grep 'U06' | sed 's/_U0[1-6]//g' > ./local/uem_file.tmp + md-eval.pl -1 -c 0.25 -u ./local/uem_file.tmp -r $ref_rttm.scoring -s $hyp_rttm.scoring |\ + awk 'or(/MISSED SPEECH/,/FALARM SPEECH/)' + fi + done +fi + +####################################################################### +# Perform diarization on the dev/eval data +####################################################################### +if [ $stage -le 4 ]; then + for datadir in ${test_sets}; do + if $use_new_rttm_reference == "true"; then + mode="$(cut -d'_' -f1 <<<"$datadir")" + ref_rttm=./chime6_rttm/${mode}_rttm + else + ref_rttm=data/${datadir}_${nnet_type}_seg/ref_rttm + fi + local/diarize.sh --nj $nj --cmd "$train_cmd" --stage $diarizer_stage \ + --ref-rttm $ref_rttm \ + exp/xvector_nnet_1a \ + data/${datadir}_${nnet_type}_seg \ + exp/${datadir}_${nnet_type}_seg_diarization + done +fi + +####################################################################### +# Decode diarized output using trained chain model +####################################################################### +if [ $stage -le 5 ]; then + for datadir in ${test_sets}; do + local/decode_diarized.sh --nj $nj --cmd "$decode_cmd" --stage $decode_diarize_stage \ + exp/${datadir}_${nnet_type}_seg_diarization data/$datadir data/lang \ + exp/chain_${train_set}_cleaned_rvb exp/nnet3_${train_set}_cleaned_rvb \ + data/${datadir}_diarized || exit 1 + done +fi + +####################################################################### +# Score decoded dev/eval sets +####################################################################### +if [ $stage -le 6 ]; then + # final scoring to get the challenge result + # please specify both dev and eval set directories so that the search parameters + # (insertion penalty and 
language model weight) will be tuned using the dev set + local/score_for_submit.sh --stage $score_stage \ + --dev_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_dev_beamformit_dereverb_diarized_2stage \ + --dev_datadir dev_beamformit_dereverb_diarized_hires \ + --eval_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_eval_beamformit_dereverb_diarized_2stage \ + --eval_datadir eval_beamformit_dereverb_diarized_hires +fi +exit 0; diff --git a/egs/chime6/s5b_track2/local/decode_diarized.sh b/egs/chime6/s5b_track2/local/decode_diarized.sh new file mode 100755 index 00000000000..f687b313893 --- /dev/null +++ b/egs/chime6/s5b_track2/local/decode_diarized.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# Copyright 2019 Ashish Arora, Vimal Manohar +# Apache 2.0. +# This script takes an rttm file, and performs decoding on a test directory. +# The output directory contains a text file which can be used for scoring. + + +stage=0 +nj=8 +cmd=queue.pl +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 6 ]; then + echo "Usage: $0 <rttm-dir> <in-data-dir> <lang-dir> <model-dir> <ivector-dir> <out-dir>" + echo "e.g.: $0 data/rttm data/dev data/lang_chain exp/chain_train_worn_simu_u400k_cleaned_rvb \ + exp/nnet3_train_worn_simu_u400k_cleaned_rvb data/dev_diarized" + echo "Options: " + echo " --nj <n|8> # number of parallel jobs." + echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." + exit 1; +fi + +rttm_dir=$1 +data_in=$2 +lang_dir=$3 +asr_model_dir=$4 +ivector_extractor=$5 +out_dir=$6 + +for f in $rttm_dir/rttm $data_in/wav.scp $data_in/text.bak \ + $lang_dir/L.fst $asr_model_dir/tree_sp/graph/HCLG.fst \ + $asr_model_dir/tdnn1b_sp/final.mdl; do + [ !
-f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 0 ]; then + echo "$0 copying data files in output directory" + cp $rttm_dir/rttm $rttm_dir/rttm_1 + sed -i 's/\.ENH//g' $rttm_dir/rttm_1 + # removing participant introduction from the hypothesis rttm + # UEM file contains the scoring durations for each recording + local/truncate_rttm.py $rttm_dir/rttm_1 local/uem_file $rttm_dir/rttm_introduction_removed + mkdir -p ${out_dir}_hires + cp ${data_in}/{wav.scp,utt2spk} ${out_dir}_hires + utils/data/get_reco2dur.sh ${out_dir}_hires +fi + +if [ $stage -le 1 ]; then + echo "$0 creating segments file from rttm and utt2spk, reco2file_and_channel " + local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm_dir/rttm_introduction_removed \ + <(awk '{print $2".ENH "$2" "$3}' $rttm_dir/rttm_introduction_removed |sort -u) \ + ${out_dir}_hires/utt2spk ${out_dir}_hires/segments + + utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt + + awk '{print $1" "$1" 1"}' ${out_dir}_hires/wav.scp > ${out_dir}_hires/reco2file_and_channel + utils/fix_data_dir.sh ${out_dir}_hires || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0 extracting mfcc freatures using segments file" + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$cmd" ${out_dir}_hires # honor the --cmd option instead of a hard-coded queue.pl + steps/compute_cmvn_stats.sh ${out_dir}_hires + cp $data_in/text.bak ${out_dir}_hires/text +fi + +if [ $stage -le 3 ]; then + echo "$0 performing decoding on the extracted features" + local/nnet3/decode.sh --affix 2stage --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 --nj $nj --ivector-dir $ivector_extractor \ + $out_dir $lang_dir $asr_model_dir/tree_sp/graph $asr_model_dir/tdnn1b_sp/ +fi + diff --git a/egs/chime6/s5b_track2/local/diarize.sh b/egs/chime6/s5b_track2/local/diarize.sh new file mode 100755 index 00000000000..d555e92c0e8 --- /dev/null +++ b/egs/chime6/s5b_track2/local/diarize.sh @@ -0,0 +1,119 @@ +#!/bin/bash +#
Copyright 2019 David Snyder +# 2020 Desh Raj + +# Apache 2.0. +# +# This script takes an input directory that has a segments file (and +# a feats.scp file), and performs diarization on it. The output directory +# contains an RTTM file which can be used to resegment the input data. + +stage=0 +nj=10 +cmd="run.pl" +ref_rttm= + +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 <model-dir> <in-data-dir> <out-dir>" + echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization" + echo "Options: " + echo " --nj <n|10> # number of parallel jobs." + echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." + echo " --ref_rttm ./local/dev_rttm # the location of the reference RTTM file" + exit 1; +fi + +model_dir=$1 +data_in=$2 +out_dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/segments $model_dir/plda \ + $model_dir/final.raw $model_dir/extract.config; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 0 ]; then + echo "$0: keeping only data corresponding to array U06 " + echo "$0: we can skip this stage, to perform diarization on all arrays " + # to perform diarization and scoring on all arrays, please skip this step and + # pass all_array = true in local/multispeaker_score.sh + cp -r data/$name data/${name}.bak + mv data/$name/wav.scp data/$name/wav.scp.bak + grep 'U06' data/$name/wav.scp.bak > data/$name/wav.scp + utils/fix_data_dir.sh data/$name + nj=2 # since we have reduced number of "speakers" now +fi + +if [ $stage -le 1 ]; then + echo "$0: computing features for x-vector extractor" + utils/fix_data_dir.sh data/${name} + rm -rf data/${name}_cmn + local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \ + data/$name data/${name}_cmn exp/${name}_cmn + cp data/$name/segments exp/${name}_cmn/ + utils/fix_data_dir.sh data/${name}_cmn +fi + +if [ $stage -le 2 ]; then + echo "$0: extracting x-vectors for all segments" +
diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \ + --nj $nj --window 1.5 --period 0.75 --apply-cmn false \ + --min-segment 0.5 $model_dir \ + data/${name}_cmn $out_dir/xvectors_${name} +fi + +# Perform PLDA scoring +if [ $stage -le 3 ]; then + # Perform PLDA scoring on all pairs of segments for each recording. + echo "$0: performing PLDA scoring between all pairs of x-vectors" + diarization/nnet3/xvector/score_plda.sh --cmd "$cmd" \ + --target-energy 0.5 \ + --nj $nj $model_dir/ $out_dir/xvectors_${name} \ + $out_dir/xvectors_${name}/plda_scores +fi + +if [ $stage -le 4 ]; then + echo "$0: performing clustering using PLDA scores (we assume 4 speakers per recording)" + awk '{print $1, "4"}' data/$name/wav.scp > data/$name/reco2num_spk + diarization/cluster.sh --cmd "$cmd" --nj $nj \ + --reco2num-spk data/$name/reco2num_spk \ + --rttm-channel 1 \ + $out_dir/xvectors_${name}/plda_scores $out_dir + echo "$0: wrote RTTM to output directory ${out_dir}" +fi + +hyp_rttm=${out_dir}/rttm + +# For scoring the diarization system, we use the same tool that was +# used in the DIHARD II challenge. This is available at: +# https://github.com/nryant/dscore +# Note that the scoring takes a single reference RTTM and a single +# hypothesis RTTM. +if [ $stage -le 5 ]; then + # If a reference RTTM file is not provided, we create one using the backed up + # segments and utt2spk files in the original data directory. + if [ -z "$ref_rttm" ]; then + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py data/$name/utt2spk.bak \ + data/$name/segments.bak data/$name/rttm + ref_rttm=data/$name/rttm + fi + echo "Diarization results for "${name} + if ! [ -d dscore ]; then + git clone https://github.com/nryant/dscore.git || exit 1; + cd dscore + python -m pip install --user -r requirements.txt + cd .. 
+ fi + sed 's/_U0[1-6]\.ENH//g' $ref_rttm > $ref_rttm.scoring + sed 's/_U0[1-6]\.ENH//g' $hyp_rttm > $hyp_rttm.scoring + ref_rttm_path=$(readlink -f ${ref_rttm}.scoring) + hyp_rttm_path=$(readlink -f ${hyp_rttm}.scoring) + cat ./local/uem_file | grep 'U06' | sed 's/_U0[1-6]//g' > ./local/uem_file.scoring + cd dscore && python score.py -u ../local/uem_file.scoring -r $ref_rttm_path \ + -s $hyp_rttm_path && cd .. || exit 1; +fi diff --git a/egs/chime6/s5b_track2/local/distant_audio_list b/egs/chime6/s5b_track2/local/distant_audio_list new file mode 120000 index 00000000000..0455876cf4d --- /dev/null +++ b/egs/chime6/s5b_track2/local/distant_audio_list @@ -0,0 +1 @@ +../../s5_track1/local/distant_audio_list \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/extract_noises.py b/egs/chime6/s5b_track2/local/extract_noises.py new file mode 120000 index 00000000000..04a6389916d --- /dev/null +++ b/egs/chime6/s5b_track2/local/extract_noises.py @@ -0,0 +1 @@ +../../s5_track1/local/extract_noises.py \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/extract_vad_weights.sh b/egs/chime6/s5b_track2/local/extract_vad_weights.sh new file mode 120000 index 00000000000..0db29cded5d --- /dev/null +++ b/egs/chime6/s5b_track2/local/extract_vad_weights.sh @@ -0,0 +1 @@ +../../s5_track1/local/extract_vad_weights.sh \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/gen_aligned_hyp.py b/egs/chime6/s5b_track2/local/gen_aligned_hyp.py new file mode 100755 index 00000000000..acaa3a13ad5 --- /dev/null +++ b/egs/chime6/s5b_track2/local/gen_aligned_hyp.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +# Copyright 2019 Yusuke Fujita +# Apache 2.0. + +"""This script generates hypothesis utterances aligned with reference segments. + Usage: gen_align_hyp.py alignment.txt wc.txt > hyp.txt + alignment.txt is a session-level word alignment generated by align-text command. 
#!/usr/bin/env python3
# Copyright 2019 Yusuke Fujita
# Apache 2.0.

"""This script generates hypothesis utterances aligned with reference segments.
  Usage: gen_align_hyp.py alignment.txt wc.txt > hyp.txt
    alignment.txt is a session-level word alignment generated by align-text command.
    wc.txt is a sequence of utt-id:reference_word_count generated by 'local/get_ref_perspeaker_persession_file.py'.
"""

import sys, io
import string

# Token align-text puts in the empty side of an insertion or deletion when it
# is run without --special-symbol. Tokens produced by str.split() are never
# empty, so comparing against '' could not detect these slots.
EPS = '<eps>'


def load_align_text(f):
    """Parse align-text output into {recording-id: [(ref, hyp), ...]}.

    Each line of f looks like "<id> ref1 hyp1 ; ref2 hyp2 ; ...".
    Blank lines are skipped; a line holding only an id maps to an empty list.
    """
    alignments = {}
    for line in f:
        fields = line.split(None, 1)
        if not fields:
            continue
        recoid = fields[0]
        pairs = []
        if len(fields) > 1:
            for tok in fields[1].split(';'):
                if not tok.strip():
                    continue  # tolerate a trailing or doubled separator
                ref, hyp = tok.split()
                pairs.append((ref, hyp))
        alignments[recoid] = pairs
    return alignments


def split_hyp_by_ref(pairs, uttid_counts):
    """Cut the aligned hypothesis words along the reference utterances.

    pairs is the (ref, hyp) alignment of one recording and uttid_counts a
    list of (utt-id, number of reference words). For every utterance the
    alignment is consumed until that many reference words have been seen;
    hypothesis words met on the way (including insertions that precede a
    reference word) are assigned to the utterance.
    Returns a list of (utt-id, hypothesis text).
    """
    ali = iter(pairs)
    result = []
    for uttid, count in uttid_counts:
        words = []
        for _ in range(count):
            while True:
                try:
                    ref, hyp = next(ali)
                except StopIteration:
                    raise ValueError('alignment ended before all reference '
                                     'words of ' + uttid + ' were found')
                if hyp != EPS:
                    words.append(hyp)
                if ref != EPS:
                    break
        result.append((uttid, ' '.join(words)))
    return result


def main():
    output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    try:
        with open(sys.argv[1], 'r', encoding='utf8') as ali_file:
            alignments = load_align_text(ali_file)
        with open(sys.argv[2], 'r', encoding='utf8') as wc_file:
            for line in wc_file:
                fields = line.split(None, 1)
                if not fields:
                    continue
                recoid = fields[0]
                uttid_counts = []
                for tok in (fields[1].split() if len(fields) > 1 else []):
                    uttid, count = tok.rsplit(':', 1)
                    uttid_counts.append((uttid, int(count)))
                for uttid, text in split_hyp_by_ref(alignments[recoid], uttid_counts):
                    output.write(uttid + ' ' + text + '\n')
    finally:
        # flush, then release stdout so dropping the wrapper does not close it
        output.flush()
        output.detach()


if __name__ == '__main__':
    main()
#! /usr/bin/env python3
# Copyright   2019  Ashish Arora
# Apache 2.0.
"""This script finds best matching of reference and hypothesis speakers.
  For the best matching speakers,it provides the WER for the reference session
  (eg:S02) and hypothesis recording (eg: S02_U02)"""

import itertools
import numpy as np
import argparse


def get_args():
    parser = argparse.ArgumentParser(
        description="""This script finds best matching of reference and hypothesis speakers.
  For the best matching it provides the WER""")
    parser.add_argument("WER_dir", type=str,
                        help="path of WER files")
    parser.add_argument("recording_id", type=str,
                        help="recording_id name")
    parser.add_argument("num_speakers", type=str,
                        help="number of speakers in ref")
    args = parser.parse_args()
    return args


def get_results(filename):
    """Read the error counts from the first line of a compute-wer output file.

    The line is expected to look like
        %WER 50.00 [ 2 / 4, 1 ins, 0 del, 1 sub ]
    Returns (total_words, insertions, deletions, substitutions) as strings.
    Raises ValueError naming the file when the line has another shape, e.g.
    when the file is empty.
    """
    with open(filename) as f:
        first_line = f.readline()
    parts = first_line.strip().split(',')
    if len(parts) < 4:
        raise ValueError('unexpected WER summary in ' + filename + ': ' + repr(first_line))
    total_words = parts[0].split()[-1]
    ins = parts[1].split()[0]
    deletions = parts[2].split()[0]
    sub = parts[3].split()[0]
    return total_words, ins, deletions, sub


def get_min_wer(recording_id, num_speakers, WER_dir):
    """Write <WER_dir>/best_wer_<recording_id> for the best speaker assignment.

    Reads <WER_dir>/wer_<recording_id>_r<i>h<j> for every reference speaker i
    and hypothesis speaker j (both 1-based), picks the one-to-one assignment
    with the fewest total errors and writes one summary line.
    """
    # third-party package; imported here so the parsing helpers above can be
    # used without it
    from munkres import Munkres

    best_wer_file = WER_dir + '/' + 'best_wer' + '_' + recording_id
    with open(best_wer_file, 'w') as best_wer_writer:
        # total_error_mat[r][h]: errors when scoring reference r against hypothesis h
        total_error_mat = [[0] * num_speakers for _ in range(num_speakers)]
        all_errors_mat = [[0] * num_speakers for _ in range(num_speakers)]
        for i in range(1, num_speakers + 1):
            for j in range(1, num_speakers + 1):
                filename = WER_dir + '/wer_' + recording_id + '_' + 'r' + str(i) + 'h' + str(j)
                total_words, ins, deletions, sub = get_results(filename)
                ins = int(ins)
                deletions = int(deletions)
                sub = int(sub)
                total_error = ins + deletions + sub
                total_error_mat[i-1][j-1] = total_error
                all_errors_mat[i-1][j-1] = (total_words, total_error, ins, deletions, sub)

        indexes = Munkres().compute(total_error_mat)
        total_errors = total_words = total_ins = total_del = total_sub = 0
        spk_order = '('
        for row, column in indexes:
            words, errs, ins, deletions, sub = all_errors_mat[row][column]
            total_errors += int(errs)
            total_words += int(words)
            total_ins += int(ins)
            total_del += int(deletions)
            total_sub += int(sub)
            spk_order = spk_order + str(column+1) + ', '
        spk_order = spk_order + ')'
        text = "Best error: (#T #E #I #D #S) " + str(total_words) + ', ' + str(total_errors) + ', ' + str(total_ins) + ', ' + str(total_del) + ', ' + str(total_sub)
        best_wer_writer.write(" recording_id: " + recording_id + ' ')
        best_wer_writer.write(' best hypothesis speaker order: ' + spk_order + ' ')
        best_wer_writer.write(text + '\n')


def main():
    args = get_args()
    get_min_wer(args.recording_id, int(args.num_speakers), args.WER_dir)


if __name__ == '__main__':
    main()
#! /usr/bin/env python
# Copyright   2019  Ashish Arora
# Apache 2.0.
"""This script splits a kaldi (text) file
  into per_array per_session per_speaker hypothesis (text) files"""

import argparse


def get_args():
    parser = argparse.ArgumentParser(
        description="""This script splits a kaldi text file
        into per_array per_session per_speaker text files""")
    parser.add_argument("input_text_path", type=str,
                        help="path of text files")
    parser.add_argument("output_dir_path", type=str,
                        help="Output path for per_array per_session per_speaker reference files")
    args = parser.parse_args()
    return args


def _time_sort_key(time_str):
    """Sort key for a "<start>-<end>" string taken from an utterance id.

    Purely numeric fields are compared as integers, so the order is by time
    even when the numbers are not zero-padded to the same width. Anything
    else is ordered as a plain string, after the numeric keys.
    """
    fields = time_str.split('-')
    if all(field.isdigit() for field in fields):
        return (0, tuple(int(field) for field in fields), time_str)
    return (1, (), time_str)


def main():
    """Write hyp_<session>_<mic>_<speaker> and ..._comb files into the output dir.

    Utterance ids look like S09_U06.ENH-4-704588-704738, i.e.
    <session>_<mic>.ENH-<speaker>-<start>-<end>.
    """
    args = get_args()
    sessionid_micid_speakerid_dict = {}
    with open(args.input_text_path) as text_file:
        for line in text_file:
            parts = line.strip().split()
            if not parts:
                continue  # blank line
            uttid_id = parts[0]
            temp = uttid_id.strip().split('.')[0]
            micid = temp.strip().split('_')[1]
            speakerid = uttid_id.strip().split('-')[1]
            sessionid = uttid_id.strip().split('_')[0]
            sessionid_micid_speakerid = sessionid + '_' + micid + '_' + speakerid
            sessionid_micid_speakerid_dict.setdefault(sessionid_micid_speakerid, []).append(line)

    for sessionid_micid_speakerid in sorted(sessionid_micid_speakerid_dict):
        hyp_file = args.output_dir_path + '/' + 'hyp' + '_' + sessionid_micid_speakerid
        combined_hyp_file = hyp_file + '_comb'
        # utterances of this speaker, keyed by "<start>-<end>"
        utterances_by_time = {}
        for line in sessionid_micid_speakerid_dict[sessionid_micid_speakerid]:
            utt_parts = line.strip().split()[0].strip().split('-')
            utterances_by_time['-'.join(utt_parts[2:])] = line

        with open(hyp_file, 'w') as hyp_writer, open(combined_hyp_file, 'w') as combined_hyp_writer:
            text = ''
            # sorting utterances by start and end time
            for time_key in sorted(utterances_by_time, key=_time_sort_key):
                line = utterances_by_time[time_key]
                text = text + ' ' + ' '.join(line.strip().split()[1:])
                hyp_writer.write(line)
            # all words of the speaker as one long utterance called "utt"
            combined_hyp_writer.write('utt' + " " + text)
            combined_hyp_writer.write('\n')


if __name__ == '__main__':
    main()
#! /usr/bin/env python
# Copyright   2019  Ashish Arora
# Apache 2.0.
"""This script splits a kaldi (text) file
  into per_speaker per_session reference (text) file"""

import argparse


def get_args():
    parser = argparse.ArgumentParser(
        description="""This script splits a kaldi text file
        into per_speaker per_session text files""")
    parser.add_argument("input_text_path", type=str,
                        help="path of text file")
    parser.add_argument("output_dir_path", type=str,
                        help="Output path for per_session per_speaker reference files")
    args = parser.parse_args()
    return args


def _time_sort_key(time_str):
    """Sort key for a "<start>-<end>" string taken from an utterance id.

    Purely numeric fields are compared as integers, so the order is by time
    even when the numbers are not zero-padded to the same width. Anything
    else is ordered as a plain string, after the numeric keys.
    """
    fields = time_str.split('-')
    if all(field.isdigit() for field in fields):
        return (0, tuple(int(field) for field in fields), time_str)
    return (1, (), time_str)


def main():
    """Write ref_<session>_<n>, ref_wc_<session>_<n> and ref_<session>_<n>_comb.

    Utterance ids are expected to start with <speaker>_<session>_ and to end
    with -<start>-<end>. <n> is the 1-based position of the speaker among the
    sorted speakers of its session.
    """
    args = get_args()
    sessionid_speakerid_dict = {}
    with open(args.input_text_path) as text_file:
        for line in text_file:
            parts = line.strip().split()
            if not parts:
                continue  # blank line
            uttid_id = parts[0]
            speakerid = uttid_id.strip().split('_')[0]
            sessionid = uttid_id.strip().split('_')[1]
            sessionid_speakerid = sessionid + '_' + speakerid
            sessionid_speakerid_dict.setdefault(sessionid_speakerid, []).append(line)

    # Number the speakers 1..N inside every session. The number is stored per
    # (session, speaker) so that a speaker id occurring in two sessions is
    # numbered independently in each of them.
    spkr_number = {}
    spkr_num = 1
    prev_sessionid = ''
    for sessionid_speakerid in sorted(sessionid_speakerid_dict):
        curr_sessionid = sessionid_speakerid.strip().split('_')[0]
        if prev_sessionid != curr_sessionid:
            prev_sessionid = curr_sessionid
            spkr_num = 1
        spkr_number[sessionid_speakerid] = spkr_num
        spkr_num += 1

    for sessionid_speakerid in sorted(sessionid_speakerid_dict):
        stem = sessionid_speakerid.split('_')[0] + '_' + str(spkr_number[sessionid_speakerid])
        ref_file = args.output_dir_path + '/ref_' + stem
        wc_file = args.output_dir_path + '/ref_wc_' + stem
        combined_ref_file = args.output_dir_path + '/ref_' + stem + '_comb'

        # utterances of this speaker, keyed by "<start>-<end>"
        utterances_by_time = {}
        for line in sessionid_speakerid_dict[sessionid_speakerid]:
            utt_parts = line.strip().split()[0].strip().split('-')
            utterances_by_time['-'.join(utt_parts[1:])] = line

        with open(ref_file, 'w') as ref_writer, \
                open(wc_file, 'w') as wc_writer, \
                open(combined_ref_file, 'w') as combined_ref_writer:
            text = ''
            uttid_wc = 'utt'
            # sorting utterances by start and end time
            for time_key in sorted(utterances_by_time, key=_time_sort_key):
                line = utterances_by_time[time_key]
                parts = line.strip().split()
                uttid_id = parts[0]
                utt_text = ' '.join(parts[1:])
                text = text + ' ' + utt_text
                ref_writer.write(line)
                # "<utt-id>:<number of words>", used later to cut the
                # hypothesis along the reference utterances
                uttid_wc = uttid_wc + ' ' + uttid_id + ":" + str(len(utt_text.split()))
            combined_ref_writer.write('utt' + " " + text)
            combined_ref_writer.write('\n')
            wc_writer.write(uttid_wc)
            wc_writer.write('\n')


if __name__ == '__main__':
    main()
#!/usr/bin/perl
#
# Copyright 2018  Ewald Enzinger
#           2018  David Snyder
#
# Usage: make_voxceleb1.pl /export/voxceleb1 data/
#
# Writes utt2spk, wav.scp and spk2utt to <data-dir>/voxceleb1_test and
# <data-dir>/voxceleb1_train, plus a trials file to the test directory.
# A speaker goes to the test directory when it occurs in the verification
# trial list, otherwise to the train directory.

if (@ARGV != 2) {
  print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
  print STDERR "e.g. $0 /export/voxceleb1 data/\n";
  exit(1);
}

my ($data_base, $out_dir) = @ARGV;
my $out_test_dir = "$out_dir/voxceleb1_test";
my $out_train_dir = "$out_dir/voxceleb1_train";

if (system("mkdir -p $out_test_dir") != 0) {
  die "Error making directory $out_test_dir";
}

if (system("mkdir -p $out_train_dir") != 0) {
  die "Error making directory $out_train_dir";
}

opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!";
my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh);
closedir $dh;

# Fetch the trial list and the meta data when they are not there yet; a failed
# download is reported by the open() calls below.
if (! -e "$data_base/voxceleb1_test.txt") {
  system("wget -O $data_base/voxceleb1_test.txt http://www.openslr.org/resources/49/voxceleb1_test.txt");
}

if (! -e "$data_base/vox1_meta.csv") {
  system("wget -O $data_base/vox1_meta.csv http://www.openslr.org/resources/49/vox1_meta.csv");
}

open(TRIAL_IN, "<", "$data_base/voxceleb1_test.txt") or die "Could not open the verification trials file $data_base/voxceleb1_test.txt";
open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv";
open(SPKR_TEST, ">", "$out_test_dir/utt2spk") or die "Could not open the output file $out_test_dir/utt2spk";
open(WAV_TEST, ">", "$out_test_dir/wav.scp") or die "Could not open the output file $out_test_dir/wav.scp";
open(SPKR_TRAIN, ">", "$out_train_dir/utt2spk") or die "Could not open the output file $out_train_dir/utt2spk";
open(WAV_TRAIN, ">", "$out_train_dir/wav.scp") or die "Could not open the output file $out_train_dir/wav.scp";
open(TRIAL_OUT, ">", "$out_test_dir/trials") or die "Could not open the output file $out_test_dir/trials";

# Splits a trial path "<speaker>/<file>" and returns the speaker id together
# with the utterance id "<speaker>-<recording>-<segment>".
sub trial_path_to_utt {
  my ($path) = @_;
  my ($spkr_id, $filename) = split('/', $path);
  my $rec_id = substr($filename, 0, 11);
  my $segment = substr($filename, 12, 7);
  return ($spkr_id, "$spkr_id-$rec_id-$segment");
}

# Map from the VoxCeleb1 id (first meta data column) to the speaker id
# (second column).
my %id2spkr = ();
while (<META_IN>) {
  chomp;
  my ($vox_id, $spkr_id, $gender, $nation, $set) = split;
  $id2spkr{$vox_id} = $spkr_id;
}

# Speakers that occur in at least one trial; only the keys are used.
my %test_spkrs = ();
while (<TRIAL_IN>) {
  chomp;
  my ($tar_or_non, $path1, $path2) = split;

  # Create entry for left-hand side of trial
  my ($spkr_id1, $utt_id1) = trial_path_to_utt($path1);
  $test_spkrs{$spkr_id1} = ();

  # Create entry for right-hand side of trial
  my ($spkr_id2, $utt_id2) = trial_path_to_utt($path2);
  $test_spkrs{$spkr_id2} = ();

  my $target = "nontarget";
  if ($tar_or_non eq "1") {
    $target = "target";
  }
  print TRIAL_OUT "$utt_id1 $utt_id2 $target\n";
}

foreach (@spkr_dirs) {
  my $spkr_id = $_;
  my $new_spkr_id = $spkr_id;
  # If we're using a newer version of VoxCeleb1, we need to "deanonymize"
  # the speaker labels.
  if (exists $id2spkr{$spkr_id}) {
    $new_spkr_id = $id2spkr{$spkr_id};
  }
  opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!";
  my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
  closedir $dh;
  foreach (@files) {
    my $filename = $_;
    my $rec_id = substr($filename, 0, 11);
    my $segment = substr($filename, 12, 7);
    my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav";
    my $utt_id = "$new_spkr_id-$rec_id-$segment";
    if (exists $test_spkrs{$new_spkr_id}) {
      print WAV_TEST "$utt_id", " $wav", "\n";
      print SPKR_TEST "$utt_id", " $new_spkr_id", "\n";
    } else {
      print WAV_TRAIN "$utt_id", " $wav", "\n";
      print SPKR_TRAIN "$utt_id", " $new_spkr_id", "\n";
    }
  }
}

close(SPKR_TEST) or die;
close(WAV_TEST) or die;
close(SPKR_TRAIN) or die;
close(WAV_TRAIN) or die;
close(TRIAL_OUT) or die;
close(TRIAL_IN) or die;
close(META_IN) or die;

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_test_dir/utt2spk >$out_test_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_test_dir";
}
system("env LC_COLLATE=C utils/fix_data_dir.sh $out_test_dir");
if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_test_dir") != 0) {
  die "Error validating directory $out_test_dir";
}

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_train_dir/utt2spk >$out_train_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_train_dir";
}
system("env LC_COLLATE=C utils/fix_data_dir.sh $out_train_dir");
if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_train_dir") != 0) {
  die "Error validating directory $out_train_dir";
}
#!/usr/bin/perl
#
# Copyright 2018  Ewald Enzinger
#
# Usage: make_voxceleb2.pl /export/voxceleb2 dev data/dev
#
# Note: This script requires ffmpeg to be installed and its location included in $PATH.
#
# Writes utt2spk, wav.scp and spk2utt for the m4a files found under
# <path-to-voxceleb2>/<dataset>/aac/<speaker>/<recording>/. Every wav.scp
# entry is an ffmpeg pipe that decodes the m4a file to 16 bit PCM wav.

if (@ARGV != 3) {
  print STDERR "Usage: $0 <path-to-voxceleb2> <dataset> <path-to-data-dir>\n";
  print STDERR "e.g. $0 /export/voxceleb2 dev data/dev\n";
  exit(1);
}

# Check that ffmpeg is installed.
if (`which ffmpeg` eq "") {
  die "Error: this script requires that ffmpeg is installed.";
}

my ($data_base, $dataset, $out_dir) = @ARGV;

if ("$dataset" ne "dev" && "$dataset" ne "test") {
  die "dataset parameter must be 'dev' or 'test'!";
}

my $aac_dir = "$data_base/$dataset/aac";

opendir my $dh, "$aac_dir" or die "Cannot open directory: $!";
my @spkr_dirs = grep {-d "$aac_dir/$_" && ! /^\.{1,2}$/} readdir($dh);
closedir $dh;

if (system("mkdir -p $out_dir") != 0) {
  die "Error making directory $out_dir";
}

open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";

foreach my $spkr_id (@spkr_dirs) {
  opendir my $spkr_dh, "$aac_dir/$spkr_id/" or die "Cannot open directory: $!";
  my @rec_dirs = grep {-d "$aac_dir/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($spkr_dh);
  closedir $spkr_dh;

  foreach my $rec_id (@rec_dirs) {
    opendir my $rec_dh, "$aac_dir/$spkr_id/$rec_id/" or die "Cannot open directory: $!";
    # names of the m4a files without their extension
    my @files = map{s/\.[^.]+$//;$_}grep {/\.m4a$/} readdir($rec_dh);
    closedir $rec_dh;

    foreach my $name (@files) {
      my $wav = "ffmpeg -v 8 -i $aac_dir/$spkr_id/$rec_id/$name.m4a -f wav -acodec pcm_s16le -|";
      my $utt_id = "$spkr_id-$rec_id-$name";
      print WAV "$utt_id", " $wav", "\n";
      print SPKR "$utt_id", " $spkr_id", "\n";
    }
  }
}
close(SPKR) or die;
close(WAV) or die;

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_dir";
}
system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
  die "Error validating directory $out_dir";
}
#!/usr/bin/env bash
# Copyright   2019   Ashish Arora, Yusuke Fujita
# Apache 2.0.
# This script takes a reference and hypothesis text file, and performs
# multispeaker scoring.
#
# Stages:
#   0  filter both texts and split them per speaker (reference: per session,
#      hypothesis: per array)
#   1  create empty hypotheses for speakers the diarization did not produce
#   2  WER of every reference speaker against every hypothesis speaker, then
#      the best speaker assignment per recording
#   3  collect the best WER of all recordings
#   4  consistency check of the collected errors
#   5  per utterance WER details (only if --get-stats true)

stage=0
cmd=queue.pl
num_spkrs=4
num_hyp_spk=4
datadir=dev_beamformit_dereverb
get_stats=true
all_array=false
declare -a recording_id_array=("S02_U06" "S09_U06")
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 <ref-file> <hyp-file> <out-dir>"
  # NOTE(review): the example below lists four arguments although three are
  # required; it is kept as it was until the intended example is confirmed.
  echo "e.g.: $0 data/diarized/text data/dev \
    exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_xvector_sad/scoring_kaldi/penalty_1.0/10.txt \
    exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_xvector_sad/scoring_kaldi_multispeaker"
  echo "Options: "
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  exit 1;
fi

ref_file=$1
hyp_file=$2
out_dir=$3

output_dir=$out_dir/per_speaker_output
wer_dir=$out_dir/per_speaker_wer

# For dev and evaluation set, we take corresponding arrays
if [[ ${datadir} == *dev* ]]; then
  recording_id_array=("S02_U06" "S09_U06")
fi

if [[ ${datadir} == *eval* ]]; then
  recording_id_array=("S01_U06" "S21_U06")
fi

if [[ ${datadir} == *dev* ]] && [[ $all_array == "true" ]]; then
  recording_id_array=("S02_U01" "S02_U02" "S02_U03" "S02_U04" "S02_U06" "S09_U01" "S09_U02" "S09_U03" "S09_U04" "S09_U06")
fi

if [[ ${datadir} == *eval* ]] && [[ $all_array == "true" ]]; then
  recording_id_array=("S01_U01" "S01_U02" "S01_U03" "S01_U04" "S01_U06" "S21_U01" "S21_U02" "S21_U03" "S21_U04" "S21_U06")
fi

for f in $ref_file $hyp_file; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

if [ $stage -le 0 ]; then
  # generate per speaker per session file at paragraph level for the reference
  # and per speaker per array file at paragraph level for the hypothesis
  mkdir -p $output_dir $wer_dir
  local/wer_output_filter < $ref_file > $output_dir/ref_filt.txt
  local/wer_output_filter < $hyp_file > $output_dir/hyp_filt.txt
  local/get_ref_perspeaker_persession_file.py $output_dir/ref_filt.txt $output_dir
  local/get_hyp_perspeaker_perarray_file.py $output_dir/hyp_filt.txt $output_dir
fi

if [ $stage -le 1 ]; then
  if [ $num_hyp_spk -le 3 ]; then
    # create dummy per speaker per array hypothesis files if the
    # predicted number of speakers by diarization is less than 4.
    # They go next to the real hypothesis files in $output_dir, which is
    # where stage 2 reads them.
    for recording_id in "${recording_id_array[@]}"; do
      for (( i=$num_hyp_spk+1; i<$num_spkrs+1; i++ )); do
        echo 'utt ' > ${output_dir}/hyp_${recording_id}_${i}_comb
      done
    done
  fi
fi

if [ $stage -le 2 ]; then
  # calculate wer for each ref and hypothesis speaker
  for recording_id in "${recording_id_array[@]}"; do
    for (( i=0; i<$((num_spkrs * num_spkrs)); i++ )); do
      ind_r=$((i / num_spkrs + 1))
      ind_h=$((i % num_spkrs + 1))
      sessionid="$(echo $recording_id | cut -d'_' -f1)"

      # compute WER with combined texts
      compute-wer --text --mode=present ark:${output_dir}/ref_${sessionid}_${ind_r}_comb \
        ark:${output_dir}/hyp_${recording_id}_${ind_h}_comb \
        > $wer_dir/wer_${recording_id}_r${ind_r}h${ind_h} 2>/dev/null
    done

    local/get_best_error.py $wer_dir $recording_id $num_spkrs
  done
fi

if [ $stage -le 3 ]; then
  # print best word error rate
  # it will print best wer for each recording and each array
  cat $wer_dir/best_wer* > $wer_dir/all.txt
  cat $wer_dir/all.txt | local/print_dset_error.py \
    $output_dir/recordinid_spkorder > $wer_dir/array_wer.txt
fi

if [ $stage -le 4 ]; then
  # checks if DP result of total error is equivalent
  # to the sum of the individual errors:
  local/check_dset_error.py $wer_dir $output_dir
fi

if [ $stage -le 5 ] && [[ $get_stats == "true" ]]; then
  # generate per utterance wer details at utterance level
  mkdir -p $wer_dir/wer_details $wer_dir/wer_details/log/
  # every line read is "<recording-id>:<h1>_<h2>_...", the hypothesis speaker
  # matched to reference speaker 1, 2, ...
  while read -r line;
  do
    recording_id=$(echo "$line" | cut -f1 -d ":")
    spkorder_str=$(echo "$line" | cut -f2 -d ":")
    sessionid=$(echo "$line" | cut -f1 -d "_")
    # IFS is set for this read only, so there is nothing to restore afterwards
    IFS='_' read -r -a spkorder_list <<< "$spkorder_str"
    ind_r=1
    for ind_h in "${spkorder_list[@]}"; do

      $cmd $wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_comb.log \
        align-text ark:${output_dir}/ref_${sessionid}_${ind_r}_comb ark:${output_dir}/hyp_${recording_id}_${ind_h}_comb ark:$output_dir/alignment_${sessionid}_r${ind_r}h${ind_h}.txt

      # split hypothesis texts along with reference utterances using word alignment of combined texts
      local/gen_aligned_hyp.py $output_dir/alignment_${sessionid}_r${ind_r}h${ind_h}.txt ${output_dir}/ref_wc_${sessionid}_${ind_r} > ${output_dir}/hyp_${recording_id}_r${ind_r}h${ind_h}_ref_segmentation

      ## compute per utterance alignments
      $cmd $wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_per_utt.log \
        cat ${output_dir}/hyp_${recording_id}_r${ind_r}h${ind_h}_ref_segmentation \| \
        align-text --special-symbol="'***'" ark:${output_dir}/ref_${sessionid}_${ind_r} ark:- ark,t:- \| \
        utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $wer_dir/wer_details/per_utt_${recording_id}_r${ind_r}h${ind_h} || exit 1

      $cmd $wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_ops.log \
        cat $wer_dir/wer_details/per_utt_${recording_id}_r${ind_r}h${ind_h} \| \
        utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
        sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $wer_dir/wer_details/ops_${recording_id}_r${ind_r}h${ind_h} || exit 1;

      ind_r=$(( ind_r + 1 ))
    done
  done < $output_dir/recordinid_spkorder
  # done generating per utterance wer details
fi
#!/usr/bin/env bash
#
# Apache 2.0.

# This script applies sliding window CMVN and writes the features to disk.
#
# Although this kind of script isn't necessary in speaker recognition recipes,
# it can be helpful in the diarization recipes.  The script
# diarization/nnet3/xvector/extract_xvectors.sh extracts x-vectors from very
# short (e.g., 1-2 seconds) segments.  Therefore, in order to apply the sliding
# window CMVN in a meaningful way, it must be performed prior to performing
# the subsegmentation.

nj=40
cmd="run.pl"
stage=0
norm_vars=false
center=true
compress=true
cmn_window=300

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
  echo "Usage: $0 <in-data-dir> <out-data-dir> <feat-dir>"
  echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features"
  echo "Options: "
  echo "  --nj <nj>                                        # number of parallel jobs"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --norm-vars <true|false>                         # If true, normalize variances in the sliding window cmvn"
  exit 1;
fi

data_in=$1
data_out=$2
dir=$3

name=`basename $data_in`

for f in $data_in/feats.scp ; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

# Set various variables.
mkdir -p $dir/log
mkdir -p $data_out
featdir=$(utils/make_absolute.sh $dir)

if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then
  utils/create_split_dir.pl \
    /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_cmvn_feats/storage $featdir/storage
fi

for n in $(seq $nj); do
  # the next command does nothing unless $featdir/storage/ exists, see
  # utils/create_data_link.pl for more info.
  utils/create_data_link.pl $featdir/xvector_cmvn_feats_${name}.${n}.ark
done

cp $data_in/utt2spk $data_out/utt2spk
cp $data_in/spk2utt $data_out/spk2utt
cp $data_in/wav.scp $data_out/wav.scp
# Optional files: copied only when present. vad.scp lives directly in the
# data directory; a path below the "segments" file can never exist.
for f in $data_in/segments $data_in/vad.scp ; do
  [ -f $f ] && cp $f $data_out/`basename $f`;
done

write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB"

sdata_in=$data_in/split$nj;
utils/split_data.sh $data_in $nj || exit 1;

# --norm-vars and --center are taken from the options of this script; their
# defaults (false / true) are the values that used to be hard-coded here.
$cmd JOB=1:$nj $dir/log/create_xvector_cmvn_feats_${name}.JOB.log \
  apply-cmvn-sliding --norm-vars=$norm_vars --center=$center --cmn-window=$cmn_window \
  scp:${sdata_in}/JOB/feats.scp ark:- \| \
  copy-feats --compress=$compress $write_num_frames_opt ark:- \
  ark,scp:$featdir/xvector_cmvn_feats_${name}.JOB.ark,$featdir/xvector_cmvn_feats_${name}.JOB.scp || exit 1;

for n in $(seq $nj); do
  cat $featdir/xvector_cmvn_feats_${name}.$n.scp || exit 1;
done > ${data_out}/feats.scp || exit 1

for n in $(seq $nj); do
  cat $featdir/log/utt2num_frames.$n || exit 1;
done > $data_out/utt2num_frames || exit 1
rm $featdir/log/utt2num_frames.*

echo "$0: Succeeded creating xvector features for $name"
parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/vad.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. 
+ utils/create_data_link.pl $featdir/xvector_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/chime6/s5b_track2/local/nnet3/xvector/run_xvector.sh b/egs/chime6/s5b_track2/local/nnet3/xvector/run_xvector.sh new file mode 120000 index 00000000000..585b63fd2dd --- /dev/null +++ b/egs/chime6/s5b_track2/local/nnet3/xvector/run_xvector.sh @@ -0,0 +1 @@ +tuning/run_xvector_1a.sh \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/nnet3/xvector/tuning/run_xvector_1a.sh b/egs/chime6/s5b_track2/local/nnet3/xvector/tuning/run_xvector_1a.sh new file mode 100755 index 00000000000..2189e406a7e --- /dev/null +++ b/egs/chime6/s5b_track2/local/nnet3/xvector/tuning/run_xvector_1a.sh @@ -0,0 +1,149 @@ +#!/usr/bin/env bash +# Copyright 2018 David Snyder +# 2018 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2018 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This script trains the x-vector DNN. 
The recipe is similar to the one
+# described in "Diarization is Hard: Some Experiences and Lessons Learned
+# for the JHU Team in the Inaugural DIHARD Challenge" by Sell et al.
+
+. ./cmd.sh
+set -e
+
+stage=1
+train_stage=-1
+use_gpu=true       # forwarded to steps/nnet3/train_raw_dnn.py --use-gpu in stage 8
+remove_egs=false
+
+data=data/train
+nnet_dir=exp/xvector_nnet_1a/
+egs_dir=exp/xvector_nnet_1a/egs
+
+. ./path.sh
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l)
+
+# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh.
+# The argument --num-repeats is related to the number of times a speaker
+# repeats per archive.  If it seems like you're getting too many archives
+# (e.g., more than 200) try increasing the --frames-per-iter option.  The
+# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the
+# minimum and maximum length (in terms of number of frames) of the features
+# in the examples.
+#
+# To make sense of the egs script, it may be necessary to put an "exit 1"
+# command immediately after stage 3.  Then, inspect
+# $egs_dir/temp/ranges.* . The ranges files specify the examples that
+# will be created, and which archives they will be stored in.  Each line of
+# ranges.* describes one example as whitespace-separated fields (the field
+# names were lost from this comment; check sid/nnet3/xvector/get_egs.sh).
+# For example:
+#    100304-f-sre2006-kacg-A 1 2 4079 881 23
+
+# If you're satisfied with the number of archives (e.g., 50-150 archives is
+# reasonable) and with the number of examples per speaker (e.g., 1000-5000
+# is reasonable) then you can let the script continue to the later stages.
+# Otherwise, try increasing or decreasing the --num-repeats option.  You might
+# need to fiddle with --frames-per-iter.  Increasing this value decreases
+# the number of archives and increases the number of examples per archive.
+# Decreasing this value increases the number of archives, while decreasing the
+# number of examples per archive.
+if [ $stage -le 6 ]; then
+  echo "$0: Getting neural network training egs";
+  # dump egs.
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b{03,04,05,06}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage
+  fi
+  sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \
+    --nj 8 \
+    --stage 0 \
+    --frames-per-iter 1000000000 \
+    --frames-per-iter-diagnostic 500000 \
+    --min-frames-per-chunk 200 \
+    --max-frames-per-chunk 400 \
+    --num-diagnostic-archives 3 \
+    --num-repeats 40 \
+    "$data" $egs_dir
+fi
+
+if [ $stage -le 7 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+  num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}')
+  feat_dim=$(cat $egs_dir/info/feat_dim)
+
+  # This chunk-size corresponds to the maximum number of frames the
+  # stats layer is able to pool over.  In this script, it corresponds
+  # to 4 seconds.  If the input recording is greater than 4 seconds,
+  # we will compute multiple xvectors from the same recording and average
+  # to produce the final xvector.
+  max_chunk_size=400
+
+  # The smallest number of frames we're comfortable computing an xvector from.
+  # Note that the hard minimum is given by the left and right context of the
+  # frame-level layers.
+  min_chunk_size=20
+  mkdir -p $nnet_dir/configs
+  cat <<EOF > $nnet_dir/configs/network.xconfig
+  # please note that it is important to have input layer with the name=input
+
+  # The frame-level layers
+  input dim=${feat_dim} name=input
+  relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512
+  relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512
+  relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512
+  relu-batchnorm-layer name=tdnn4 dim=512
+  relu-batchnorm-layer name=tdnn5 dim=1500
+
+  # The stats pooling layer. Layers after this are segment-level.
+  # In the config below, the first and last argument (0, and ${max_chunk_size})
+  # means that we pool over an input segment starting at frame 0
+  # and ending at frame ${max_chunk_size} or earlier.  The other arguments (1:1)
+  # mean that no subsampling is performed.
+  stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size})
+
+  # This is where we usually extract the embedding (aka xvector) from.
+  relu-batchnorm-layer name=tdnn6 dim=128 input=stats
+  output-layer name=output include-log-softmax=true dim=${num_targets}
+EOF
+
+  steps/nnet3/xconfig_to_configs.py \
+      --xconfig-file $nnet_dir/configs/network.xconfig \
+      --config-dir $nnet_dir/configs/
+  cp $nnet_dir/configs/final.config $nnet_dir/nnet.config
+
+  # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh
+  echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config
+  echo "$max_chunk_size" > $nnet_dir/max_chunk_size
+  echo "$min_chunk_size" > $nnet_dir/min_chunk_size
+fi
+
+dropout_schedule='0,0@0.20,0.1@0.50,0'
+srand=123
+if [ $stage -le 8 ]; then
+  steps/nnet3/train_raw_dnn.py --stage=$train_stage \
+    --cmd="$train_cmd" \
+    --trainer.optimization.proportional-shrink 10 \
+    --trainer.optimization.momentum=0.5 \
+    --trainer.optimization.num-jobs-initial=3 \
+    --trainer.optimization.num-jobs-final=8 \
+    --trainer.optimization.initial-effective-lrate=0.001 \
+    --trainer.optimization.final-effective-lrate=0.0001 \
+    --trainer.optimization.minibatch-size=64 \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2 \
+    --trainer.num-epochs=3 \
+    --trainer.dropout-schedule="$dropout_schedule" \
+    --trainer.shuffle-buffer-size=1000 \
+    --egs.frames-per-eg=1 \
+    --egs.dir="$egs_dir" \
+    --cleanup.remove-egs $remove_egs \
+    --cleanup.preserve-model-interval=10 \
+    --use-gpu=$use_gpu \
+    --dir=$nnet_dir  || exit 1;
+fi
+
+exit 0;
diff --git a/egs/chime6/s5b_track2/local/prepare_data.sh b/egs/chime6/s5b_track2/local/prepare_data.sh
new file mode 100755
index 
00000000000..8bd2530d6db --- /dev/null +++ b/egs/chime6/s5b_track2/local/prepare_data.sh @@ -0,0 +1,149 @@ +#!/usr/bin/env bash +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe, Yenda Trmal) +# Apache 2.0 + +# Begin configuration section. +mictype=worn # worn, ref or others +cleanup=true +train=true + +# End configuration section +. ./utils/parse_options.sh # accept options.. you can run this run.sh with the + +. ./path.sh + +echo >&2 "$0" "$@" +if [ $# -ne 3 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train" + exit 1 +fi + +set -e -o pipefail + +adir=$1 +jdir=$2 +dir=$3 + +json_count=$(find -L $jdir -name "*.json" | wc -l) +wav_count=$(find -L $adir -name "*.wav" | wc -l) + +if [ "$json_count" -eq 0 ]; then + echo >&2 "We expect that the directory $jdir will contain json files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi +if [ "$wav_count" -eq 0 ]; then + echo >&2 "We expect that the directory $adir will contain wav files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi + +echo "$0: Converting transcription to text" + +mkdir -p $dir +for file in $jdir/*json; do + ./local/json2text.py --mictype $mictype $file +done | \ + sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" |\ + sed -e 's/ - / /g' |\ + sed -e 's/mm-/mm/g' > $dir/text.orig + +echo "$0: Creating datadir $dir for type=\"$mictype\"" + +if [ $mictype == "worn" ]; then + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key, add .L and .R for left and right channel + # i.e. 
each file will have two entries (left and right channel) + find -L $adir -name "S[0-9]*_P[0-9]*.wav" | \ + perl -ne '{ + chomp; + $path = $_; + next unless $path; + @F = split "/", $path; + ($f = $F[@F-1]) =~ s/.wav//; + @F = split "_", $f; + print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n"; + print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n"; + }' | sort > $dir/wav.scp + + # generate the transcripts for both left and right channel + # from the original transcript in the form + # P09_S03-0006072-0006147 gimme the baker + # create left and right channel transcript + # P09_S03.L-0006072-0006147 gimme the baker + # P09_S03.R-0006072-0006147 gimme the baker + sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' $dir/text.orig | sort > $dir/text +elif [ $mictype == "ref" ]; then + # fixed reference array + + # first get a text, which will be used to extract reference arrays + perl -ne 's/-/.ENH-/;print;' $dir/text.orig | sort > $dir/text + + find -L $adir | grep "\.wav" | sort > $dir/wav.flist + # following command provide the argument for grep to extract only reference arrays + #grep `cut -f 1 -d"-" $dir/text | awk -F"_" '{print $2 "_" $3}' | sed -e "s/\.ENH//" | sort | uniq | sed -e "s/^/ -e /" | tr "\n" " "` $dir/wav.flist > $dir/wav.flist2 + paste -d" " \ + <(awk -F "/" '{print $NF}' $dir/wav.flist | sed -e "s/\.wav/.ENH/") \ + $dir/wav.flist | sort > $dir/wav.scp +else + # array mic case + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key + find -L $adir -name "*.wav" -ipath "*${mictype}*" |\ + perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\ + sort -u > $dir/wav.scp + + # convert the transcripts from + # P09_S03-0006072-0006147 gimme the baker + # to the per-channel transcripts + # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker + # 
P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker + perl -ne '$l=$_; + for($i=1; $i<=4; $i++) { + ($x=$l)=~ s/-/.CH\Q$i\E-/; + print $x;}' $dir/text.orig | sort > $dir/text + +fi +$cleanup && rm -f $dir/text.* $dir/wav.scp.* $dir/wav.flist + +# Prepare 'segments', 'utt2spk', 'spk2utt' +if [ $mictype == "worn" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" \ + > $dir/segments +elif [ $mictype == "ref" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e "s/ P.._/ /" > $dir/segments +else + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e 's/ P.._/ /' > $dir/segments +fi +cut -f 1 -d ' ' $dir/segments | \ + perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > $dir/utt2spk + +utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt + +if [ $train != 'true' ]; then + # For scoring the final system, we need the original utt2spk + # and text file. So we keep them with the extension .bak here + # so that they don't affect the validate_data_dir steps in + # the intermediate steps. + for file in text utt2spk spk2utt segments; do + mv $dir/$file $dir/$file.bak + done + + # For dev and eval data, prepare pseudo utt2spk. 
+  awk '{print $1, $1}' $dir/wav.scp > $dir/utt2spk
+  utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
+fi
diff --git a/egs/chime6/s5b_track2/local/prepare_dict.sh b/egs/chime6/s5b_track2/local/prepare_dict.sh
new file mode 120000
index 00000000000..ada30947463
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/prepare_dict.sh
@@ -0,0 +1 @@
+../../s5_track1/local/prepare_dict.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/print_dset_error.py b/egs/chime6/s5b_track2/local/print_dset_error.py
new file mode 100755
index 00000000000..8ffe930f4f6
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/print_dset_error.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+# Copyright 2019 Ashish Arora
+# Apache 2.0.
+
+# Reads per-recording scoring lines from stdin. For every line it writes
+# "<recording-id>:<speaker order>" to the file named by sys.argv[1] and adds
+# the word/error/ins/del/sub counts to the totals of the line's array id (the
+# second "_"-separated field of the recording id). Finally it prints one
+# "%WER" summary line per array id to stdout.
+
+import sys, io
+import string
+infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
+output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+array_id_error_dict={}
+# "with" closes (and flushes) the speaker-order file even if a line fails to parse.
+with open(sys.argv[1],'w', encoding='utf8') as spkorder_writer:
+    for line in infile:
+        toks = line.strip().split()
+        recordingid = toks[1]
+        # Every count field except the last carries one trailing character, dropped by [:-1].
+        total_words = toks[-5][:-1]
+        total_errors = toks[-4][:-1]
+        total_ins = toks[-3][:-1]
+        total_del = toks[-2][:-1]
+        total_sub = toks[-1]
+        spk_order = toks[6][1] + '_' + toks[7][0] + '_' + toks[8][0] + '_' + toks[9][0]
+        spkorder_writer.write(recordingid + ':' + spk_order + '\n')
+        arrayid=recordingid.strip().split('_')[1]
+        if arrayid not in array_id_error_dict:
+            array_id_error_dict[arrayid]=[0]*5
+        array_id_error_dict[arrayid][0]+=int(total_words)
+        array_id_error_dict[arrayid][1]+=int(total_errors)
+        array_id_error_dict[arrayid][2]+=int(total_ins)
+        array_id_error_dict[arrayid][3]+=int(total_del)
+        array_id_error_dict[arrayid][4]+=int(total_sub)
+
+
+for arrayid in sorted(array_id_error_dict):
+    num_words = array_id_error_dict[arrayid][0]
+    # An array id whose lines sum to zero words would otherwise raise ZeroDivisionError.
+    wer = float(array_id_error_dict[arrayid][1])/float(num_words)*100 if num_words else 0.0
+    wer_detail = "%WER {0:5.2f} [ {1} / {2}, {3} ins, {4} del, {5} sub ]".format(wer, array_id_error_dict[arrayid][1], array_id_error_dict[arrayid][0], array_id_error_dict[arrayid][2], array_id_error_dict[arrayid][3], array_id_error_dict[arrayid][4])
+    output.write(arrayid + ' ' + wer_detail + '\n')
+
diff --git a/egs/chime6/s5b_track2/local/reverberate_lat_dir.sh b/egs/chime6/s5b_track2/local/reverberate_lat_dir.sh
new file mode 120000
index 00000000000..57302268f6d
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/reverberate_lat_dir.sh
@@ -0,0 +1 @@
+../../s5_track1/local/reverberate_lat_dir.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/run_beamformit.sh b/egs/chime6/s5b_track2/local/run_beamformit.sh
new file mode 120000
index 00000000000..832a16e3ba7
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/run_beamformit.sh
@@ -0,0 +1 @@
+../../s5_track1/local/run_beamformit.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/run_ivector_common.sh b/egs/chime6/s5b_track2/local/run_ivector_common.sh
new file mode 120000
index 00000000000..df7fca84335
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/run_ivector_common.sh
@@ -0,0 +1 @@
+../../s5_track1/local/nnet3/run_ivector_common.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/run_wpe.py b/egs/chime6/s5b_track2/local/run_wpe.py
new file mode 120000
index 00000000000..6621607c932
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/run_wpe.py
@@ -0,0 +1 @@
+../../s5_track1/local/run_wpe.py
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/run_wpe.sh b/egs/chime6/s5b_track2/local/run_wpe.sh
new file mode 120000
index 00000000000..187080e62e4
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/run_wpe.sh
@@ -0,0 +1 @@
+../../s5_track1/local/run_wpe.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/score.sh b/egs/chime6/s5b_track2/local/score.sh
new file mode 120000
index 00000000000..6a200b42ed3
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/score.sh
@@ -0,0 +1 @@
+../steps/scoring/score_kaldi_wer.sh
\ No newline at end of file
diff --git 
a/egs/chime6/s5b_track2/local/score_for_submit.sh b/egs/chime6/s5b_track2/local/score_for_submit.sh
new file mode 100755
index 00000000000..71a3a4dd607
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/score_for_submit.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+# Apache 2.0
+#
+# This script provides CHiME-6 challenge track 2 submission scores.
+# It calculates the best search parameter configurations by using the dev set
+# and provides wer for dev and eval
+
+cmd=run.pl
+stage=0
+word_ins_penalty=0.0,0.5,1.0
+min_lmwt=7
+max_lmwt=17
+dev_decodedir=exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_beamformit_dereverb_diarized_2stage
+eval_decodedir=exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_eval_beamformit_dereverb_diarized_2stage
+dev_datadir=dev_beamformit_dereverb_diarized_hires
+eval_datadir=eval_beamformit_dereverb_diarized_hires
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+if [ $# -ne 0 ]; then
+  echo "Usage: $0 [--cmd (run.pl|queue.pl...)]"
+  echo "This script provides CHiME-6 challenge submission scores"
+  echo " Options:"
+  echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
+  echo " --dev_decodedir # dev set decoding directory"
+  echo " --eval_decodedir # eval set decoding directory"
+  echo " --dev_datadir # dev set data directory"
+  echo " --eval_datadir # eval set data directory"
+  echo " --min_lmwt # minumum LM-weight for lattice rescoring "
+  echo " --max_lmwt # maximum LM-weight for lattice rescoring "
+
+  exit 1;
+fi
+
+if [ $stage -le 1 ]; then
+  # obtaining multi speaker WER for all lmwt and wip
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    for LMWT in $(seq $min_lmwt $max_lmwt); do
+      local/multispeaker_score.sh --cmd "$cmd" \
+        --datadir $dev_datadir --get_stats false data/$dev_datadir/text \
+        $dev_decodedir/scoring_kaldi/penalty_$wip/$LMWT.txt \
+        $dev_decodedir/scoring_kaldi_multispeaker/penalty_$wip/$LMWT
+    done
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  # obtaining best lmwt, wip and wer
+  # adding /dev/null to the command list below forces grep to output the filename
+  mkdir -p $dev_decodedir/scoring_kaldi_multispeaker
+  grep WER $dev_decodedir/scoring_kaldi_multispeaker/penalty_*/*/per_speaker_wer/array_wer.txt /dev/null \
+    | utils/best_wer.sh >& $dev_decodedir/scoring_kaldi_multispeaker/best_wer
+
+  best_wer_file=$(awk '{print $NF}' $dev_decodedir/scoring_kaldi_multispeaker/best_wer)
+  best_array=$(echo $best_wer_file | awk -F: '{N=NF; print $N}')
+  best_lmwt=$(echo $best_wer_file | awk -F/ '{N=NF-2; print $N}')
+  best_wip=$(echo $best_wer_file | awk -F_ '{N=NF-3; print $N}' | awk -F/ '{N=NF-2; print $N}')
+
+  # printing and storing best lmwt, best_array and wip
+  echo "best array: $best_array"
+  echo "best LM weight: $best_lmwt"
+  echo "best insertion penalty weight: $best_wip"
+
+  echo $best_lmwt > $dev_decodedir/scoring_kaldi_multispeaker/lmwt
+  echo $best_wip > $dev_decodedir/scoring_kaldi_multispeaker/wip
+  echo $best_array > $dev_decodedir/scoring_kaldi_multispeaker/best_array
+fi
+
+# Stages 3-5 use the parameters tuned in stage 2.  Reload them from the files
+# written by stage 2 so that the script can be resumed with --stage 3 (or
+# later); previously stages 3 and 4 saw empty values in that case.
+if [ $stage -le 5 ]; then
+  for f in best_array lmwt wip; do
+    if [ ! -f $dev_decodedir/scoring_kaldi_multispeaker/$f ]; then
+      echo "$0: missing $dev_decodedir/scoring_kaldi_multispeaker/$f; run stage 2 first" >&2
+      exit 1
+    fi
+  done
+  best_array="$(cat $dev_decodedir/scoring_kaldi_multispeaker/best_array)"
+  best_lmwt="$(cat $dev_decodedir/scoring_kaldi_multispeaker/lmwt)"
+  best_wip="$(cat $dev_decodedir/scoring_kaldi_multispeaker/wip)"
+fi
+
+if [ $stage -le 3 ]; then
+  # obtaining per utterance stats for dev
+  local/multispeaker_score.sh --cmd "$cmd" \
+    --datadir $dev_datadir data/$dev_datadir/text \
+    $dev_decodedir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \
+    $dev_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/
+fi
+
+if [ $stage -le 4 ]; then
+  # obtaining per utterance stats for eval
+  local/multispeaker_score.sh --cmd "$cmd" \
+    --datadir $eval_datadir data/$eval_datadir/text \
+    $eval_decodedir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \
+    $eval_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/
+fi
+
+if [ $stage -le 5 ]; then
+  # obtaining eval wer corresponding to best lmwt, best_array and wip of dev
+  grep WER $eval_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/per_speaker_wer/array_wer.txt /dev/null \
+    | grep "$best_array" | utils/best_wer.sh >& $eval_decodedir/scoring_kaldi_multispeaker/best_wer
+
+  # printing dev and eval wer
+  echo "Dev: $(<$dev_decodedir/scoring_kaldi_multispeaker/best_wer)" | cut -d " " -f 1-15
+  echo "Eval: $(<$eval_decodedir/scoring_kaldi_multispeaker/best_wer)" | cut -d " " -f 1-14
+fi
+
diff --git a/egs/chime6/s5b_track2/local/segmentation/detect_speech_activity.sh b/egs/chime6/s5b_track2/local/segmentation/detect_speech_activity.sh
new file mode 100755
index 00000000000..c9719d472f3
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/segmentation/detect_speech_activity.sh
@@ -0,0 +1,217 @@
+#!/usr/bin/env bash
+
+# Copyright 2016-17  Vimal Manohar
+#              2017  Nagendra Kumar Goel
+# Apache 2.0.
+
+# This script does nnet3-based speech activity detection given an input
+# kaldi data directory and outputs a segmented kaldi data directory.
+# This script can also do music detection and other similar segmentation
+# using appropriate options such as --output-name output-music.
+ +set -e +set -o pipefail +set -u + +if [ -f ./path.sh ]; then . ./path.sh; fi + +affix= # Affix for the segmentation +nj=32 +cmd=queue.pl +stage=-1 + +# Feature options (Must match training) +mfcc_config=conf/mfcc_hires.conf +feat_affix= # Affix for the type of feature used + +output_name=output # The output node in the network +sad_name=sad # Base name for the directory storing the computed loglikes + # Can be music for music detection +segmentation_name=segmentation # Base name for the directory doing segmentation + # Can be segmentation_music for music detection + +# SAD network config +iter=final # Model iteration to use + +# Contexts must ideally match training for LSTM models, but +# may not necessarily for stats components +extra_left_context=0 # Set to some large value, typically 40 for LSTM (must match training) +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +frames_per_chunk=150 + +# Decoding options +graph_opts="--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0" +acwt=0.3 + +# These _in__weight represent the fraction of probability +# to transfer to class. +# e.g. --speech-in-sil-weight=0.0 --garbage-in-sil-weight=0.0 --sil-in-speech-weight=0.0 --garbage-in-speech-weight=0.3 +transform_probs_opts="" + +# Postprocessing options +segment_padding=0.2 # Duration (in seconds) of padding added to segments +min_segment_dur=0 # Minimum duration (in seconds) required for a segment to be included + # This is before any padding. Segments shorter than this duration will be removed. + # This is an alternative to --min-speech-duration above. +merge_consecutive_max_dur=0 # Merge consecutive segments as long as the merged segment is no longer than this many + # seconds. The segments are only merged if their boundaries are touching. + # This is after padding by --segment-padding seconds. + # 0 means do not merge. Use 'inf' to not limit the duration. + +echo $* + +. 
utils/parse_options.sh + +if [ $# -ne 5 ]; then + echo "This script does nnet3-based speech activity detection given an input kaldi " + echo "data directory and outputs an output kaldi data directory." + echo "See script for details of the options to be supplied." + echo "Usage: $0 " + echo " e.g.: $0 ~/workspace/egs/ami/s5b/data/sdm1/dev exp/nnet3_sad_snr/nnet_tdnn_j_n4 \\" + echo " mfcc_hires exp/segmentation_sad_snr/nnet_tdnn_j_n4 data/ami_sdm1_dev" + echo "" + echo "Options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # number of parallel jobs to run." + echo " --stage # stage to do partial re-run from." + echo " --convert-data-dir-to-whole # If true, the input data directory is " + echo " # first converted to whole data directory (i.e. whole recordings) " + echo " # and segmentation is done on that." + echo " # If false, then the original segments are " + echo " # retained and they are split into sub-segments." + echo " --output-name # The output node in the network" + echo " --extra-left-context # Set to some large value, typically 40 for LSTM (must match training)" + echo " --extra-right-context # For BLSTM or statistics pooling" + exit 1 +fi + +src_data_dir=$1 # The input data directory that needs to be segmented. + # If convert_data_dir_to_whole is true, any segments in that will be ignored. +sad_nnet_dir=$2 # The SAD neural network +mfcc_dir=$3 # The directory to store the features +dir=$4 # Work directory +data_dir=$5 # The output data directory will be ${data_dir}_seg + +affix=${affix:+_$affix} +feat_affix=${feat_affix:+_$feat_affix} + +data_id=`basename $data_dir` +sad_dir=${dir}/${sad_name}${affix}_${data_id}${feat_affix} +seg_dir=${dir}/${segmentation_name}${affix}_${data_id}${feat_affix} +test_data_dir=data/${data_id}${feat_affix} + +############################################################################### +## Forward pass through the network network and dump the log-likelihoods. 
+############################################################################### + +frame_subsampling_factor=1 +if [ -f $sad_nnet_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $sad_nnet_dir/frame_subsampling_factor) +fi + +mkdir -p $dir +if [ $stage -le 1 ]; then + if [ "$(readlink -f $sad_nnet_dir)" != "$(readlink -f $dir)" ]; then + cp $sad_nnet_dir/cmvn_opts $dir || exit 1 + fi + + ######################################################################## + ## Initialize neural network for decoding using the output $output_name + ######################################################################## + + if [ ! -z "$output_name" ] && [ "$output_name" != output ]; then + $cmd $dir/log/get_nnet_${output_name}.log \ + nnet3-copy --edits="rename-node old-name=$output_name new-name=output" \ + $sad_nnet_dir/$iter.raw $dir/${iter}_${output_name}.raw || exit 1 + iter=${iter}_${output_name} + else + if ! diff $sad_nnet_dir/$iter.raw $dir/$iter.raw; then + cp $sad_nnet_dir/$iter.raw $dir/ + fi + fi + + steps/nnet3/compute_output.sh --nj $nj --cmd "$cmd" \ + --iter ${iter} \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk $frames_per_chunk --apply-exp true \ + --frame-subsampling-factor $frame_subsampling_factor \ + ${test_data_dir} $dir $sad_dir || exit 1 +fi + +############################################################################### +## Prepare FST we search to make speech/silence decisions. 
+############################################################################### + +utils/data/get_utt2dur.sh --nj $nj --cmd "$cmd" $test_data_dir || exit 1 +frame_shift=$(utils/data/get_frame_shift.sh $test_data_dir) || exit 1 + +graph_dir=${dir}/graph_${output_name} +if [ $stage -le 2 ]; then + mkdir -p $graph_dir + + # 1 for silence and 2 for speech + cat < $graph_dir/words.txt + 0 +silence 1 +speech 2 +EOF + + $cmd $graph_dir/log/make_graph.log \ + steps/segmentation/internal/prepare_sad_graph.py $graph_opts \ + --frame-shift=$(perl -e "print $frame_shift * $frame_subsampling_factor") - \| \ + fstcompile --isymbols=$graph_dir/words.txt --osymbols=$graph_dir/words.txt '>' \ + $graph_dir/HCLG.fst +fi + +############################################################################### +## Do Viterbi decoding to create per-frame alignments. +############################################################################### + +post_vec=$sad_nnet_dir/post_${output_name}.vec +if [ ! -f $sad_nnet_dir/post_${output_name}.vec ]; then + if [ ! -f $sad_nnet_dir/post_${output_name}.txt ]; then + echo "$0: Could not find $sad_nnet_dir/post_${output_name}.vec. " + echo "Re-run the corresponding stage in the training script possibly " + echo "with --compute-average-posteriors=true or compute the priors " + echo "from the training labels" + exit 1 + else + post_vec=$sad_nnet_dir/post_${output_name}.txt + fi +fi + +mkdir -p $seg_dir +if [ $stage -le 3 ]; then + steps/segmentation/internal/get_transform_probs_mat.py \ + --priors="$post_vec" $transform_probs_opts > $seg_dir/transform_probs.mat + + steps/segmentation/decode_sad.sh --acwt $acwt --cmd "$cmd" \ + --nj $nj \ + --transform "$seg_dir/transform_probs.mat" \ + $graph_dir $sad_dir $seg_dir +fi + +############################################################################### +## Post-process segmentation to create kaldi data directory. 
+############################################################################### + +if [ $stage -le 4 ]; then + steps/segmentation/post_process_sad_to_segments.sh \ + --segment-padding $segment_padding --min-segment-dur $min_segment_dur \ + --merge-consecutive-max-dur $merge_consecutive_max_dur \ + --cmd "$cmd" --frame-shift $(perl -e "print $frame_subsampling_factor * $frame_shift") \ + ${test_data_dir} ${seg_dir} ${seg_dir} +fi + +if [ $stage -le 5 ]; then + utils/data/subsegment_data_dir.sh ${test_data_dir} ${seg_dir}/segments \ + ${data_dir}_seg +fi + +echo "$0: Created output segmented kaldi data directory in ${data_dir}_seg" +exit 0 diff --git a/egs/chime6/s5b_track2/local/segmentation/tuning/train_lstm_sad_1a.sh b/egs/chime6/s5b_track2/local/segmentation/tuning/train_lstm_sad_1a.sh new file mode 100755 index 00000000000..7ea39f45639 --- /dev/null +++ b/egs/chime6/s5b_track2/local/segmentation/tuning/train_lstm_sad_1a.sh @@ -0,0 +1,140 @@ +#!/usr/bin/env bash + +# Copyright 2017 Nagendra Kumar Goel +# 2018 Vimal Manohar +# Apache 2.0 + +# This is a script to train a TDNN for speech activity detection (SAD) +# using LSTM for long-context information. + +stage=0 +train_stage=-10 +get_egs_stage=-10 +egs_opts= + +chunk_width=20 + +extra_left_context=60 +extra_right_context=10 +relu_dim=256 +cell_dim=256 +projection_dim=64 + +# training options +num_epochs=1 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=8 +remove_egs=true +max_param_change=0.2 # Small max-param change for small network +dropout_schedule='0,0@0.20,0.1@0.50,0' + +egs_dir= +nj=40 + +dir= +affix=1a + +data_dir= +targets_dir= + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +set -o pipefail +set -u + +if [ -z "$dir" ]; then + dir=exp/segmentation_1a/tdnn_lstm_asr_sad +fi +dir=$dir${affix:+_$affix} + +if ! 
cuda-compiled; then + cat < $dir/cmvn_opts + +if [ $stage -le 5 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=`feat-to-dim scp:$data_dir/feats.scp -` name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 input=lda dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn3 input=Append(-3,0,3,6) dim=$relu_dim add-log-stddev=true + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim decay-time=20 delay=-3 dropout-proportion=0.0 + relu-renorm-layer name=tdnn4 input=Append(-6,0,6,12) add-log-stddev=true dim=$relu_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim decay-time=20 delay=-3 dropout-proportion=0.0 + relu-renorm-layer name=tdnn5 input=Append(-12,0,12,24) dim=$relu_dim + + output-layer name=output include-log-softmax=true dim=3 learning-rate-factor=0.1 input=tdnn5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs/ + + cat <> $dir/configs/vars +num_targets=3 +EOF +fi + +if [ $stage -le 6 ]; then + num_utts=`cat $data_dir/utt2spk | wc -l` + # Set num_utts_subset for diagnostics to a reasonable value + # of max(min(0.005 * num_utts, 300), 12) + num_utts_subset=`perl -e '$n=int($ARGV[0] * 0.005); print ($n > 300 ? 300 : ($n < 12 ? 
12 : $n))' $num_utts` + + steps/nnet3/train_raw_rnn.py --stage=$train_stage \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$egs_dir" --egs.stage=$get_egs_stage \ + --egs.chunk-left-context=$extra_left_context \ + --egs.chunk-right-context=$extra_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.optimization.shrink-value=0.99 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.max-param-change=$max_param_change \ + --trainer.compute-per-dim-accuracy=true \ + --cmd="$decode_cmd" --nj $nj \ + --cleanup=true \ + --cleanup.remove-egs=$remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --use-dense-targets=true \ + --feat-dir=$data_dir \ + --targets-scp="$targets_dir/targets.scp" \ + --egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \ + --dir=$dir || exit 1 +fi + +if [ $stage -le 7 ]; then + # Use a subset to compute prior over the output targets + $train_cmd $dir/log/get_priors.log \ + matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \ + ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1 + + echo 3 > $dir/frame_subsampling_factor +fi diff --git a/egs/chime6/s5b_track2/local/segmentation/tuning/train_stats_sad_1a.sh b/egs/chime6/s5b_track2/local/segmentation/tuning/train_stats_sad_1a.sh new file mode 100755 index 00000000000..83bcd587d88 --- /dev/null +++ 
b/egs/chime6/s5b_track2/local/segmentation/tuning/train_stats_sad_1a.sh @@ -0,0 +1,150 @@ +#!/usr/bin/env bash + +# Copyright 2017 Nagendra Kumar Goel +# 2018 Vimal Manohar +# Apache 2.0 + +# This is a script to train a TDNN for speech activity detection (SAD) +# using statistics pooling for long-context information. + +stage=0 +train_stage=-10 +get_egs_stage=-10 +egs_opts= + +chunk_width=20 + +# The context is chosen to be around 1 second long. The context at test time +# is expected to be around the same. +extra_left_context=79 +extra_right_context=21 + +relu_dim=256 + +# training options +num_epochs=1 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=8 +remove_egs=true +max_param_change=0.2 # Small max-param change for small network + +egs_dir= +nj=40 + +dir= +affix=1a + +data_dir= +targets_dir= + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +set -o pipefail +set -u + +if [ -z "$dir" ]; then + dir=exp/segmentation_1a/tdnn_stats_sad +fi +dir=$dir${affix:+_$affix} + +if ! 
cuda-compiled; then + cat < $dir/cmvn_opts + +if [ $stage -le 5 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=`feat-to-dim scp:$data_dir/feats.scp -` name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 input=lda dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn3 input=Append(-3,0,3,6) dim=$relu_dim add-log-stddev=true + stats-layer name=tdnn3_stats config=mean+count(-99:3:9:99) + relu-renorm-layer name=tdnn4 input=Append(tdnn3@-6,tdnn3@0,tdnn3@6,tdnn3@12,tdnn3_stats) add-log-stddev=true dim=$relu_dim + stats-layer name=tdnn4_stats config=mean+count(-108:6:18:108) + relu-renorm-layer name=tdnn5 input=Append(tdnn4@-12,tdnn4@0,tdnn4@12,tdnn4@24,tdnn4_stats) dim=$relu_dim + + output-layer name=output include-log-softmax=true dim=3 learning-rate-factor=0.1 input=tdnn5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs/ + + cat <> $dir/configs/vars +num_targets=3 +EOF +fi + +if [ $stage -le 6 ]; then + num_utts=`cat $data_dir/utt2spk | wc -l` + # Set num_utts_subset for diagnostics to a reasonable value + # of max(min(0.005 * num_utts, 300), 12) + num_utts_subset=`perl -e '$n=int($ARGV[0] * 0.005); print ($n > 300 ? 300 : ($n < 12 ? 
12 : $n))' $num_utts` + + steps/nnet3/train_raw_rnn.py --stage=$train_stage \ + --feat.cmvn-opts="$cmvn_opts" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$egs_dir" --egs.stage=$get_egs_stage \ + --egs.chunk-left-context=$extra_left_context \ + --egs.chunk-right-context=$extra_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.max-param-change=$max_param_change \ + --trainer.compute-per-dim-accuracy=true \ + --cmd="$decode_cmd" --nj $nj \ + --cleanup=true \ + --cleanup.remove-egs=$remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --use-dense-targets=true \ + --feat-dir=$data_dir \ + --targets-scp="$targets_dir/targets.scp" \ + --egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \ + --dir=$dir || exit 1 +fi + +if [ $stage -le 7 ]; then + # Use a subset to compute prior over the output targets + #$train_cmd $dir/log/get_priors.log \ + # matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \ + # ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1 + + # Since the train data is individual microphones, while the dev and + # eval are beamformed, it is likely that the train contains a much + # higher ratio of silences. So using priors computed from the train + # data may miss a lot of speech in the dev/eval sets. Hence we manually + # tune the prior on the dev set.
+ # With the following prior, the SAD system results are: + # Dev (using -c 0.25) + # MISSED SPEECH = 1188.59 secs ( 3.3 percent of scored time) + # FALARM SPEECH = 539.37 secs ( 1.5 percent of scored time) + echo "[ 30 2 1 ]" > $dir/post_output.vec || exit 1 + + echo 3 > $dir/frame_subsampling_factor +fi + diff --git a/egs/chime6/s5b_track2/local/train_diarizer.sh b/egs/chime6/s5b_track2/local/train_diarizer.sh new file mode 100755 index 00000000000..845ac7840d5 --- /dev/null +++ b/egs/chime6/s5b_track2/local/train_diarizer.sh @@ -0,0 +1,186 @@ +#!/usr/bin/env bash +# Copyright +# 2019 David Snyder +# Apache 2.0. +# +# This script is based on the run.sh script in the Voxceleb v2 recipe. +# It trains an x-vector DNN for diarization. + +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +voxceleb1_root=/export/corpora/VoxCeleb1 +voxceleb2_root=/export/corpora/VoxCeleb2 +data_dir=train_worn_simu_u400k +model_dir=exp/xvector_nnet_1a + +stage=0 +train_stage=-1 + +. ./cmd.sh + +if [ -f ./path.sh ]; then . ./path.sh; fi +set -e -u -o pipefail +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + exit 1 +fi + +if [ $stage -le 0 ]; then + echo "$0: preparing voxceleb 2 data" + local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train + local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test + + echo "$0: preparing voxceleb 1 data (see comments if this step fails)" + # The format of the voxceleb 1 corpus has changed several times since it was + # released. Therefore, our dataprep scripts may or may not fail depending + # on the version of the corpus you obtained. 
+ # If you downloaded the corpus soon after it was first released, this + # version of the dataprep script might work: + local/make_voxceleb1.pl $voxceleb1_root data/voxceleb1 + # However, if you've downloaded the corpus recently, you may need to use the + # following scripts instead: + #local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train + #local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test + + # We should now have about 7,351 speakers and 1,277,503 utterances. + utils/combine_data.sh data/voxceleb data/voxceleb2_train data/voxceleb2_test +fi + +if [ $stage -le 1 ]; then + echo "$0: preparing features for training data (voxceleb 1 + 2)" + steps/make_mfcc.sh --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/voxceleb exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/voxceleb + # Note that we apply CMN to the MFCCs and write these to the disk. These + # features will later be used to train the x-vector DNN. +fi + +# In this section, we augment the voxceleb data with reverberation. +# Note that we can probably improve the x-vector DNN if we include +# augmentations from the nonspeech regions of the Chime 6 training +# dataset. +if [ $stage -le 2 ]; then + echo "$0: applying augmentation to x-vector training data (just reverb for now)" + frame_shift=0.01 + awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/voxceleb/utt2num_frames > data/voxceleb/reco2dur + + if [ !
-d "RIRS_NOISES" ]; then + echo "$0: downloading simulated room impulse response dataset" + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # Make a version with reverberated speech + rvb_opts=() + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + + # Make a reverberated version of the training data. Note that we don't add any + # additive noise here. + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 0 \ + --isotropic-noise-addition-probability 0 \ + --num-replications 1 \ + --source-sampling-rate 16000 \ + data/voxceleb data/voxceleb_reverb + utils/copy_data_dir.sh --utt-suffix "-reverb" data/voxceleb_reverb data/voxceleb_reverb.new + rm -rf data/voxceleb_reverb + mv data/voxceleb_reverb.new data/voxceleb_reverb +fi + +if [ $stage -le 3 ]; then + echo "$0: making MFCCs for augmented training data" + # Make MFCCs for the augmented data. Note that we do not compute a new + # vad.scp file here. Instead, we use the vad.scp from the clean version of + # the list. + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/voxceleb_reverb exp/make_mfcc $mfccdir + # Combine the clean and augmented training data. This is now roughly + # double the size of the original clean list. + utils/combine_data.sh data/voxceleb_combined data/voxceleb_reverb data/voxceleb +fi + +# Now we prepare the features to generate examples for xvector training. +if [ $stage -le 4 ]; then + # This script applies CMVN and removes nonspeech frames. Note that this is somewhat + # wasteful, as it roughly doubles the amount of training data on disk. 
After + # creating voxceleb examples, this can be removed. + echo "$0: preparing features to train x-vector DNN" + local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/voxceleb_combined data/voxceleb_combined_cmn exp/voxceleb_combined_cmn + utils/fix_data_dir.sh data/voxceleb_combined_cmn +fi + +if [ $stage -le 5 ]; then + # Now, we need to remove features that are too short after removing silence + # frames. We want at least 4s (400 frames) per utterance. + min_len=400 + mv data/voxceleb_combined_cmn/utt2num_frames data/voxceleb_combined_cmn/utt2num_frames.bak + awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/voxceleb_combined_cmn/utt2num_frames.bak > data/voxceleb_combined_cmn/utt2num_frames + utils/filter_scp.pl data/voxceleb_combined_cmn/utt2num_frames data/voxceleb_combined_cmn/utt2spk > data/voxceleb_combined_cmn/utt2spk.new + mv data/voxceleb_combined_cmn/utt2spk.new data/voxceleb_combined_cmn/utt2spk + utils/fix_data_dir.sh data/voxceleb_combined_cmn + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 8 utterances. + min_num_utts=8 + awk '{print $1, NF-1}' data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/spk2num + awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' data/voxceleb_combined_cmn/spk2num | utils/filter_scp.pl - data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/spk2utt.new + mv data/voxceleb_combined_cmn/spk2utt.new data/voxceleb_combined_cmn/spk2utt + utils/spk2utt_to_utt2spk.pl data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/utt2spk + + utils/filter_scp.pl data/voxceleb_combined_cmn/utt2spk data/voxceleb_combined_cmn/utt2num_frames > data/voxceleb_combined_cmn/utt2num_frames.new + mv data/voxceleb_combined_cmn/utt2num_frames.new data/voxceleb_combined_cmn/utt2num_frames + + utils/fix_data_dir.sh data/voxceleb_combined_cmn +fi + +# Stages 6 through 8 are handled in run_xvector.sh. 
+# This script trains the x-vector DNN on the augmented voxceleb data. +local/nnet3/xvector/run_xvector.sh --stage $stage --train-stage $train_stage \ + --data data/voxceleb_combined_cmn --nnet-dir $model_dir \ + --egs-dir $model_dir/egs + +if [ $stage -le 9 ]; then + echo "$0: preparing a subset of Chime 6 training data to train PLDA model" + utils/subset_data_dir.sh ${data_dir} 100000 data/plda_train + steps/make_mfcc.sh --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/plda_train exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/plda_train + local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/plda_train data/plda_train_cmn exp/plda_train_cmn + if [ -f data/plda_train/segments ]; then + cp data/plda_train/segments data/plda_train_cmn/ + fi +fi + +if [ $stage -le 10 ]; then + echo "$0: extracting x-vector for PLDA training data" + utils/fix_data_dir.sh data/plda_train_cmn + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 10G" \ + --nj 40 --window 3.0 --period 10.0 --min-segment 1.5 --apply-cmn false \ + --hard-min true $model_dir \ + data/plda_train_cmn $model_dir/xvectors_plda_train +fi + +# Train PLDA models +if [ $stage -le 11 ]; then + echo "$0: training PLDA model" + $train_cmd $model_dir/xvectors_plda_train/log/plda.log \ + ivector-compute-plda ark:$model_dir/xvectors_plda_train/spk2utt \ + "ark:ivector-subtract-global-mean \ + scp:$model_dir/xvectors_plda_train/xvector.scp ark:- \ + | transform-vec $model_dir/xvectors_plda_train/transform.mat ark:- ark:- \ + | ivector-normalize-length ark:- ark:- |" \ + $model_dir/xvectors_plda_train/plda || exit 1; + cp $model_dir/xvectors_plda_train/plda $model_dir/ + cp $model_dir/xvectors_plda_train/transform.mat $model_dir/ + cp $model_dir/xvectors_plda_train/mean.vec $model_dir/ +fi diff --git a/egs/chime6/s5b_track2/local/train_lms_srilm.sh b/egs/chime6/s5b_track2/local/train_lms_srilm.sh new file mode 120000 index 
00000000000..a7666f6cded --- /dev/null +++ b/egs/chime6/s5b_track2/local/train_lms_srilm.sh @@ -0,0 +1 @@ +../../s5_track1/local/train_lms_srilm.sh \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/train_sad.sh b/egs/chime6/s5b_track2/local/train_sad.sh new file mode 100755 index 00000000000..cbaf3dfc5de --- /dev/null +++ b/egs/chime6/s5b_track2/local/train_sad.sh @@ -0,0 +1,155 @@ +#!/usr/bin/env bash + +# Copyright 2017 Nagendra Kumar Goel +# 2017 Vimal Manohar +# 2019 Desh Raj +# Apache 2.0 + +# This script is based on local/run_asr_segmentation.sh script in the +# Aspire recipe. It demonstrates nnet3-based speech activity detection for +# segmentation. +# This script: +# 1) Prepares targets (per-frame labels) for a subset of training data +# using GMM models +# 2) Trains TDNN+Stats or TDNN+LSTM neural network using the targets +# 3) Demonstrates using the SAD system to get segments of dev data + +lang=data/lang # Must match the one used to train the models +lang_test=data/lang_test # Lang directory for decoding. + +data_dir= +test_sets= +# Model directory used to align the $data_dir to get target labels for training +# SAD. This should typically be a speaker-adapted system. +sat_model_dir= +# Model directory used to decode the whole-recording version of the $data_dir to +# get target labels for training SAD. This should typically be a +# speaker-independent system like LDA+MLLT system. +model_dir= +graph_dir= # Graph for decoding whole-recording version of $data_dir. + # If not provided, a new one will be created using $lang_test + +# List of weights on labels obtained from alignment; +# labels obtained from decoding; and default labels in out-of-segment regions +merge_weights=1.0,0.1,0.5 + +prepare_targets_stage=-10 +nstage=-10 +train_stage=-10 +stage=0 +nj=50 +reco_nj=40 + +# test options +test_nj=10 + +. ./cmd.sh +. ./conf/sad.conf + +if [ -f ./path.sh ]; then . ./path.sh; fi + +set -e -u -o pipefail +.
utils/parse_options.sh + +if [ $# -ne 0 ]; then + exit 1 +fi + +dir=exp/segmentation${affix} +sad_work_dir=exp/sad${affix}_${nnet_type}/ +sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a + +mkdir -p $dir +mkdir -p ${sad_work_dir} + +# See $lang/phones.txt and decide which should be garbage +garbage_phones="laughs inaudible" +silence_phones="sil spn noise" + +for p in $garbage_phones; do + for a in "" "_B" "_E" "_I" "_S"; do + echo "$p$a" + done +done > $dir/garbage_phones.txt + +for p in $silence_phones; do + for a in "" "_B" "_E" "_I" "_S"; do + echo "$p$a" + done +done > $dir/silence_phones.txt + +if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ + steps/segmentation/internal/verify_phones_list.py $lang/phones.txt; then + echo "$0: Invalid $dir/{silence,garbage}_phones.txt" + exit 1 +fi + +# The training data may already be segmented, so we first prepare +# a "whole" training data (not segmented) for training the SAD +# system. + +whole_data_dir=${data_dir}_whole +whole_data_id=$(basename $whole_data_dir) + +if [ $stage -le 0 ]; then + utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir +fi + +############################################################################### +# Extract features for the whole data directory. We extract 13-dim MFCCs to +# generate targets using the GMM system, and 40-dim MFCCs to train the NN-based +# SAD. 
+############################################################################### +if [ $stage -le 1 ]; then + steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ + --mfcc-config conf/mfcc.conf \ + $whole_data_dir exp/make_mfcc/${whole_data_id} + steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${whole_data_id} + utils/fix_data_dir.sh $whole_data_dir + + utils/copy_data_dir.sh $whole_data_dir ${whole_data_dir}_hires + steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf \ + ${whole_data_dir}_hires exp/make_mfcc/${whole_data_id}_hires + steps/compute_cmvn_stats.sh ${whole_data_dir}_hires exp/make_mfcc/${whole_data_id}_hires + utils/fix_data_dir.sh ${whole_data_dir}_hires +fi + +############################################################################### +# Prepare SAD targets for recordings +############################################################################### +targets_dir=$dir/${whole_data_id}_combined_targets_sub3 +if [ $stage -le 2 ]; then + steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ + --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ + --nj $nj --reco-nj $reco_nj --lang-test $lang \ + --garbage-phones-list $dir/garbage_phones.txt \ + --silence-phones-list $dir/silence_phones.txt \ + --merge-weights "$merge_weights" \ + --remove-mismatch-frames false \ + --graph-dir "$graph_dir" \ + $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir +fi + +############################################################################### +# Train a neural network for SAD +############################################################################### +if [ $stage -le 3 ]; then + if [ $nnet_type == "stats" ]; then + # Train a STATS-pooling network for SAD + local/segmentation/tuning/train_stats_sad_1a.sh \ + --stage $nstage --train-stage $train_stage \ + --targets-dir ${targets_dir} \ + --data-dir ${whole_data_dir}_hires 
--affix "1a" || exit 1 + + elif [ $nnet_type == "lstm" ]; then + # Train a TDNN+LSTM network for SAD + local/segmentation/tuning/train_lstm_sad_1a.sh \ + --stage $nstage --train-stage $train_stage \ + --targets-dir ${targets_dir} \ + --data-dir ${whole_data_dir}_hires --affix "1a" || exit 1 + + fi +fi + +exit 0; diff --git a/egs/chime6/s5b_track2/local/truncate_rttm.py b/egs/chime6/s5b_track2/local/truncate_rttm.py new file mode 100755 index 00000000000..3de0c0a60d6 --- /dev/null +++ b/egs/chime6/s5b_track2/local/truncate_rttm.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# Apache 2.0 +# This script truncates the rttm file +# using UEM file and writes it to a new rttm file +# +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +from scorelib.turn import trim_turns +import scorelib.rttm as rttm_func +from scorelib.uem import load_uem + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script truncates the rttm file + using UEM file""") + parser.add_argument("rttm_file", type=str, + help="""Input RTTM file. + The format of the RTTM file is + """ + """ """) + parser.add_argument("uem_file", type=str, + help="""Input UEM file. 
+ The format of the UEM file is + """) + parser.add_argument("rttm_file_write", type=str, + help="""output RTTM file.""") + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = get_args() + rttm_writer = open(args.rttm_file_write, 'w') + turns, speaker_ids, file_ids = rttm_func.load_rttm(args.rttm_file) + loaded_uem = load_uem(args.uem_file) + truncated_turns = trim_turns(turns, loaded_uem) + rttm_func.write_rttm(args.rttm_file_write,truncated_turns) diff --git a/egs/chime6/s5b_track2/local/uem_file b/egs/chime6/s5b_track2/local/uem_file new file mode 100644 index 00000000000..c1d4dbcd5d4 --- /dev/null +++ b/egs/chime6/s5b_track2/local/uem_file @@ -0,0 +1,20 @@ +S01_U01 1 0 12000 +S02_U01 1 75 12000 +S09_U01 1 64 12000 +S21_U01 1 59 12000 +S01_U02 1 0 12000 +S02_U02 1 75 12000 +S09_U02 1 64 12000 +S21_U02 1 59 12000 +S01_U03 1 0 12000 +S02_U03 1 75 12000 +S09_U03 1 64 12000 +S21_U03 1 59 12000 +S01_U04 1 0 12000 +S02_U04 1 75 12000 +S09_U04 1 64 12000 +S21_U04 1 59 12000 +S01_U06 1 0 12000 +S02_U06 1 75 12000 +S09_U06 1 64 12000 +S21_U06 1 59 12000 diff --git a/egs/chime6/s5b_track2/local/wer_output_filter b/egs/chime6/s5b_track2/local/wer_output_filter new file mode 120000 index 00000000000..12a6c616d3d --- /dev/null +++ b/egs/chime6/s5b_track2/local/wer_output_filter @@ -0,0 +1 @@ +../../s5_track1/local/wer_output_filter \ No newline at end of file diff --git a/egs/chime6/s5b_track2/path.sh b/egs/chime6/s5b_track2/path.sh new file mode 100644 index 00000000000..2f4e4e4fb21 --- /dev/null +++ b/egs/chime6/s5b_track2/path.sh @@ -0,0 +1,9 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH +export PATH=$PWD/dscore:$PATH +export PYTHONPATH="${PYTHONPATH}:$PWD/dscore" +[ ! 
-f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/chime6/s5b_track2/run.sh b/egs/chime6/s5b_track2/run.sh new file mode 100755 index 00000000000..d5548518287 --- /dev/null +++ b/egs/chime6/s5b_track2/run.sh @@ -0,0 +1,300 @@ +#!/usr/bin/env bash +# +# Chime-6 Track 2 baseline. Based mostly on the Chime-5 recipe, with the exception +# that we are required to perform speech activity detection and speaker +# diarization before ASR, since we do not have access to the oracle SAD and +# diarization labels. +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# 2019 Desh Raj, David Snyder, Ashish Arora +# Apache 2.0 + +# Begin configuration section. +nj=50 +decode_nj=20 +stage=0 +nnet_stage=-10 +sad_stage=0 +diarizer_stage=0 +decode_stage=0 +enhancement=beamformit # for a new enhancement method, + # change this variable and decode stage +decode_only=false +num_data_reps=4 +snrs="20:10:15:5:0" +foreground_snrs="20:10:15:5:0" +background_snrs="20:10:15:5:0" +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. 
./path.sh + +if [ $decode_only == "true" ]; then + stage=18 +fi + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +# chime6 data directories, which are generated from ${chime5_corpus}, +# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +audio_dir=${chime6_corpus}/audio + +# training and test data +train_set=train_worn_simu_u400k +sad_train_set=train_worn_u400k +test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb" + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1; + +########################################################################### +# We first generate the synchronized audio files across arrays and +# corresponding JSON files. Note that this requires sox v14.4.2, +# which is installed via miniconda in ./local/check_tools.sh +########################################################################### + +if [ $stage -le 0 ]; then + local/generate_chime6_data.sh \ + --cmd "$train_cmd" \ + ${chime5_corpus} \ + ${chime6_corpus} +fi + +########################################################################### +# We prepare dict and lang in stages 1 to 3. 
+########################################################################### + +if [ $stage -le 1 ]; then + # skip u03 and u04 as they are missing + for mictype in worn u01 u02 u05 u06; do + local/prepare_data.sh --mictype ${mictype} --train true \ + ${audio_dir}/train ${json_dir}/train data/train_${mictype} + done + for dataset in dev; do + for mictype in worn; do + local/prepare_data.sh --mictype ${mictype} --train true \ + ${audio_dir}/${dataset} ${json_dir}/${dataset} \ + data/${dataset}_${mictype} + done + done +fi + +if [ $stage -le 2 ]; then + local/prepare_dict.sh + + utils/prepare_lang.sh \ + data/local/dict "" data/local/lang data/lang + + local/train_lms_srilm.sh \ + --train-text data/train_worn/text --dev-text data/dev_worn/text \ + --oov-symbol "" --words-file data/lang/words.txt \ + data/ data/srilm +fi + +LM=data/srilm/best_3gram.gz +if [ $stage -le 3 ]; then + # Compiles G for chime5 trigram LM + utils/format_lm.sh \ + data/lang $LM data/local/dict/lexicon.txt data/lang + +fi + +if [ $stage -le 4 ]; then + # remove possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24) + # see http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details + utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up + grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text + utils/fix_data_dir.sh data/train_worn + + # Remove S12_U05 from training data since it has known issues + utils/copy_data_dir.sh data/train_u05 data/train_u05_org # back up + grep -v -e "^S12_U05" data/train_u05_org/text > data/train_u05/text + utils/fix_data_dir.sh data/train_u05 +fi + +######################################################################################### +# In stages 5 and 6, we augment and fix train data for our training purpose. point source +# noises are extracted from chime corpus. Here we use 400k utterances from array microphones, +# its augmentation and all the worn set utterances in train. 
+######################################################################################### + +if [ $stage -le 5 ]; then + echo "$0: Extracting noise list from training data" + local/extract_noises.py $chime6_corpus/audio/train $chime6_corpus/transcriptions/train \ + local/distant_audio_list distant_noises + local/make_noise_list.py distant_noises > distant_noise_list + + noise_list=distant_noise_list + + echo "$0: Preparing simulated RIRs for data augmentation" + if [ ! -d RIRS_NOISES/ ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters $noise_list) + + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications $num_data_reps \ + --max-noises-per-minute 1 \ + --source-sampling-rate 16000 \ + data/train_worn data/train_worn_rvb +fi + +if [ $stage -le 6 ]; then + # combine mix array and worn mics + # randomly extract first 400k utterances from all mics + # if you want to include more training data, you can increase the number of array mic utterances + utils/combine_data.sh data/train_uall data/train_u01 data/train_u02 data/train_u05 data/train_u06 + utils/subset_data_dir.sh data/train_uall 400000 data/train_u400k + utils/combine_data.sh data/${train_set} data/train_worn data/train_worn_rvb data/train_u400k + utils/combine_data.sh data/${sad_train_set} 
data/train_worn data/train_u400k +fi + +if [ $stage -le 7 ]; then + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + utils/copy_data_dir.sh data/${train_set} data/${train_set}_nosplit + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${train_set}_nosplit data/${train_set} +fi + +################################################################################## +# Now make MFCC features. We use 13-dim MFCCs to train the GMM-HMM models. +################################################################################## + +if [ $stage -le 8 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + echo "$0: make features..." + mfccdir=mfcc + steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \ + --mfcc-config conf/mfcc.conf \ + data/${train_set} exp/make_mfcc/${train_set} $mfccdir + steps/compute_cmvn_stats.sh data/${train_set} exp/make_mfcc/${train_set} $mfccdir + utils/fix_data_dir.sh data/${train_set} +fi + +################################################################################### +# Stages 9 to 14 train monophone and triphone models. They will be used for +# generating lattices for training the chain model and for obtaining targets +# for training the SAD system. 
+################################################################################### + +if [ $stage -le 9 ]; then + # make a subset for monophone training + utils/subset_data_dir.sh --shortest data/${train_set} 100000 data/${train_set}_100kshort + utils/subset_data_dir.sh data/${train_set}_100kshort 30000 data/${train_set}_30kshort +fi + +if [ $stage -le 10 ]; then + # Starting basic training on MFCC features + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_30kshort data/lang exp/mono +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1 +fi + +if [ $stage -le 12 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 13 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 14 ]; then + # The following script cleans the data and produces cleaned data + steps/cleanup/clean_and_segment_data.sh --nj $nj --cmd "$train_cmd" \ + --segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \ + data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned +fi + +########################################################################## +# CHAIN MODEL TRAINING +# You can also download a pretrained chain ASR model using: +# wget http://kaldi-asr.org/models/12/0012_asr_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_asr_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ 
+########################################################################## +if [ $stage -le 15 ]; then + # chain TDNN + local/chain/run_tdnn.sh --nj $nj \ + --stage $nnet_stage \ + --train-set ${train_set}_cleaned \ + --test-sets "$test_sets" \ + --gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned_rvb +fi + +########################################################################## +# SAD MODEL TRAINING +# You can also download a pretrained SAD model using: +# wget http://kaldi-asr.org/models/12/0012_sad_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_sad_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ +########################################################################## +if [ $stage -le 16 ]; then + local/train_sad.sh --stage $sad_stage --nj $nj \ + --data-dir data/${sad_train_set} --test-sets "${test_sets}" \ + --sat-model-dir exp/tri3_cleaned \ + --model-dir exp/tri2 +fi + +########################################################################## +# DIARIZATION MODEL TRAINING +# You can also download a pretrained diarization model using: +# wget http://kaldi-asr.org/models/12/0012_diarization_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_diarization_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ +########################################################################## +if [ $stage -le 17 ]; then + local/train_diarizer.sh --stage $diarizer_stage \ + --data-dir data/${train_set} \ + --model-dir exp/xvector_nnet_1a +fi + +########################################################################## +# DECODING: In track 2, we are given raw utterances without segment +# or speaker information, so we have to decode the whole pipeline, i.e., +# SAD -> Diarization -> ASR. This is done in the local/decode.sh +# script. 
+########################################################################## +if [ $stage -le 18 ]; then + local/decode.sh --stage $decode_stage \ + --enhancement $enhancement \ + --test-sets "$test_sets" +fi + +exit 0; + diff --git a/egs/chime6/s5b_track2/sid b/egs/chime6/s5b_track2/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/chime6/s5b_track2/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/chime6/s5b_track2/steps b/egs/chime6/s5b_track2/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/chime6/s5b_track2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/chime6/s5b_track2/utils b/egs/chime6/s5b_track2/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/chime6/s5b_track2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file From f3eb364b240713e33a8c6cd77aaa656bf08a37ad Mon Sep 17 00:00:00 2001 From: medennikov Date: Wed, 27 May 2020 23:24:06 +0300 Subject: [PATCH 02/10] Add scripts for TS-VAD iterative diarization with a trained model --- egs/chime6/s5b_track2/local/decode_ts-vad.sh | 273 ++++++++++++++++++ .../s5b_track2/local/prepare_diarized_data.sh | 60 ++++ .../local/ts-vad/compute_ts-vad_weights.sh | 164 +++++++++++ .../local/ts-vad/convert_prob_to_rttm.py | 165 +++++++++++ .../local/ts-vad/diarize_TS-VAD_it1.sh | 154 ++++++++++ .../local/ts-vad/diarize_TS-VAD_it2.sh | 209 ++++++++++++++ .../local/ts-vad/modify_ups_utt2spk.pl | 40 +++ .../local/ts-vad/split_feats_seg.pl | 71 +++++ .../s5b_track2/local/ts-vad/vad_prob_mod.py | 97 +++++++ egs/chime6/s5b_track2/run.sh | 20 +- 10 files changed, 1249 insertions(+), 4 deletions(-) create mode 100755 egs/chime6/s5b_track2/local/decode_ts-vad.sh create mode 100755 egs/chime6/s5b_track2/local/prepare_diarized_data.sh create mode 100755 egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh create mode 100644 
egs/chime6/s5b_track2/local/ts-vad/convert_prob_to_rttm.py
 create mode 100755 egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it1.sh
 create mode 100755 egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it2.sh
 create mode 100755 egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl
 create mode 100755 egs/chime6/s5b_track2/local/ts-vad/split_feats_seg.pl
 create mode 100644 egs/chime6/s5b_track2/local/ts-vad/vad_prob_mod.py

diff --git a/egs/chime6/s5b_track2/local/decode_ts-vad.sh b/egs/chime6/s5b_track2/local/decode_ts-vad.sh
new file mode 100755
index 00000000000..f410d72d701
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/decode_ts-vad.sh
@@ -0,0 +1,283 @@
+#!/usr/bin/env bash
+#
+# This script decodes raw utterances through the entire pipeline:
+# Feature extraction -> SAD -> Diarization -> TS-VAD diarization -> ASR
+#
+# Copyright  2017  Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
+#            2019  Desh Raj, David Snyder, Ashish Arora, Zhaoheng Ni
+#            2020  Ivan Medennikov
+# Apache 2.0
+
+# Begin configuration section.
+nj=8
+stage=0
+sad_stage=0
+score_sad=true
+diarizer_stage=0
+score_stage=0
+ts_vad_num_iters=3
+
+enhancement=beamformit
+
+# option to use the new RTTM reference for sad and diarization
+use_new_rttm_reference=true
+
+# chime5 main directory path
+# please change the path accordingly
+chime5_corpus=/export/corpora4/CHiME5
+# chime6 data directories, which are generated from ${chime5_corpus},
+# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly
+chime6_corpus=${PWD}/CHiME6
+json_dir=${chime6_corpus}/transcriptions
+audio_dir=${chime6_corpus}/audio
+
+enhanced_dir=enhanced
+enhanced_dir=$(utils/make_absolute.sh $enhanced_dir) || exit 1
+
+# training data
+train_set=train_worn_simu_u400k
+test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb"
+
+# ts-vad
+ts_vad_dir=exp/ts-vad_b
+ivector_dir=exp/nnet3_b
+
+. ./utils/parse_options.sh
+
+. ./cmd.sh
+. ./path.sh
+. ./conf/sad.conf
+
+# Fetch the new RTTM reference only after the options have been parsed, so
+# that "--use-new-rttm-reference false" is honoured, and only when it is not
+# there yet, so that re-running the script does not fail on the clone.
+if [ "$use_new_rttm_reference" == "true" ] && [ ! -d chime6_rttm ]; then
+  git clone https://github.com/nateanl/chime6_rttm || exit 1
+fi
+
+# This script also needs the phonetisaurus g2p, srilm, beamformit
+#./local/check_tools.sh || exit 1
+
+###########################################################################
+# We first generate the synchronized audio files across arrays and
+# corresponding JSON files. Note that this requires sox v14.4.2,
+# which is installed via miniconda in ./local/check_tools.sh
+###########################################################################
+
+if [ $stage -le 0 ]; then
+  local/generate_chime6_data.sh \
+    --cmd "$train_cmd" \
+    ${chime5_corpus} \
+    ${chime6_corpus}
+fi
+
+#######################################################################
+# Prepare the dev and eval data with dereverberation (WPE) and
+# beamforming.
+#######################################################################
+if [ $stage -le 1 ]; then
+  # Beamforming using reference arrays
+  # enhanced WAV directory
+  enhandir=enhan
+  dereverb_dir=${PWD}/wav/wpe/
+
+  for dset in dev eval; do
+    for mictype in u01 u02 u03 u04 u06; do
+      local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 20G" \
+        ${audio_dir}/${dset} \
+        ${dereverb_dir}/${dset} \
+        ${mictype}
+    done
+  done
+
+  for dset in dev eval; do
+    for mictype in u01 u02 u03 u04 u06; do
+      local/run_beamformit.sh --cmd "$train_cmd" \
+        ${dereverb_dir}/${dset} \
+        ${enhandir}/${dset}_${enhancement}_${mictype} \
+        ${mictype}
+    done
+  done
+
+  # Note that for the evaluation sets, we use the flag
+  # "--train false". This keeps the files segments, text,
+  # and utt2spk with .bak extensions, so that they can
+  # be used later for scoring if needed but are not used
+  # in the intermediate stages.
+  for dset in dev eval; do
+    local/prepare_data.sh --mictype ref --train false \
+      "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \
+      ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb
+  done
+
+fi
+
+if [ $stage -le 2 ]; then
+  # mfccdir should be some place with a largish disk where you
+  # want to store MFCC features.
+  mfccdir=mfcc
+  for x in ${test_sets}; do
+    steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \
+      --mfcc-config conf/mfcc_hires.conf \
+      data/$x exp/make_mfcc/$x $mfccdir
+  done
+fi
+
+#######################################################################
+# Perform SAD on the dev/eval data
+#######################################################################
+# NOTE(review): affix and nnet_type are not set in this script; presumably
+# they come from conf/sad.conf, which is sourced above -- confirm.
+dir=exp/segmentation${affix}
+sad_work_dir=exp/sad${affix}_${nnet_type}/
+sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a
+
+if [ $stage -le 3 ]; then
+  for datadir in ${test_sets}; do
+    test_set=data/${datadir}
+    if [ ! -f ${test_set}/wav.scp ]; then
+      echo "$0: Not performing SAD on ${test_set}"
+      # NOTE(review): this ends the whole script with a success status, so
+      # the remaining test sets and all later stages are skipped -- confirm.
+      exit 0
+    fi
+    # Perform segmentation
+    local/segmentation/detect_speech_activity.sh --nj $nj --stage $sad_stage \
+      $test_set $sad_nnet_dir mfcc $sad_work_dir \
+      data/${datadir} || exit 1
+
+    test_dir=data/${datadir}_${nnet_type}_seg
+    mv data/${datadir}_seg ${test_dir}/
+    cp data/${datadir}/{segments.bak,utt2spk.bak} ${test_dir}/
+    # Generate RTTM file from segmentation performed by SAD. This can
+    # be used to evaluate the performance of the SAD as an intermediate
+    # step.
+    steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
+      ${test_dir}/utt2spk ${test_dir}/segments ${test_dir}/rttm
+
+    if [ "$score_sad" == "true" ]; then
+      echo "Scoring $datadir.."
+      # We first generate the reference RTTM from the backed up utt2spk and segments
+      # files.
+      ref_rttm=${test_dir}/ref_rttm
+      steps/segmentation/convert_utt2spk_and_segments_to_rttm.py ${test_dir}/utt2spk.bak \
+        ${test_dir}/segments.bak ${test_dir}/ref_rttm
+
+      # To score, we select just U06 segments from the hypothesis RTTM.
+      hyp_rttm=${test_dir}/rttm.U06
+      grep 'U06' ${test_dir}/rttm > ${test_dir}/rttm.U06
+      echo "Array U06 selected for scoring.."
+
+      if [ "$use_new_rttm_reference" == "true" ]; then
+        echo "Use the new RTTM reference."
+        mode="$(cut -d'_' -f1 <<<"$datadir")"
+        ref_rttm=./chime6_rttm/${mode}_rttm
+      fi
+
+      sed 's/_U0[1-6]\.ENH//g' $ref_rttm > $ref_rttm.scoring
+      sed 's/_U0[1-6]\.ENH//g' $hyp_rttm > $hyp_rttm.scoring
+      grep 'U06' ./local/uem_file | sed 's/_U0[1-6]//g' > ./local/uem_file.tmp
+      md-eval.pl -1 -c 0.25 -u ./local/uem_file.tmp -r $ref_rttm.scoring -s $hyp_rttm.scoring |\
+        awk '/MISSED SPEECH/ || /FALARM SPEECH/'
+    fi
+  done
+fi
+
+#######################################################################
+# Perform diarization on the dev/eval data
+#######################################################################
+if [ $stage -le 4 ]; then
+  for datadir in ${test_sets}; do
+    if [ "$use_new_rttm_reference" == "true" ]; then
+      mode="$(cut -d'_' -f1 <<<"$datadir")"
+      ref_rttm=./chime6_rttm/${mode}_rttm
+    else
+      ref_rttm=data/${datadir}_${nnet_type}_seg/ref_rttm
+    fi
+    local/diarize.sh --nj $nj --cmd "$train_cmd" --stage $diarizer_stage \
+      --ref-rttm $ref_rttm \
+      exp/xvector_nnet_1a \
+      data/${datadir}_${nnet_type}_seg \
+      exp/${datadir}_${nnet_type}_seg_diarization
+  done
+fi
+
+#######################################################################
+# Perform TS-VAD diarization on the dev/eval data
+#######################################################################
+if [ $stage -le 5 ]; then
+  for datadir in ${test_sets}; do
+    mode="$(cut -d'_' -f1 <<<"$datadir")"
+    if [ "$use_new_rttm_reference" == "true" ]; then
+      ref_rttm=./chime6_rttm/${mode}_rttm
+    else
+      ref_rttm=data/${datadir}_${nnet_type}_seg/ref_rttm
+    fi
+
+    if [ ! -f data/${datadir}_diarized_hires/feats.scp ]; then
+      local/prepare_diarized_data.sh --cmd "$train_cmd" \
+        exp/${datadir}_${nnet_type}_seg_diarization \
+        data/$datadir data/${datadir}_diarized || exit 1
+    fi
+
+    # 1st iteration
+    it=1
+    ivector_affix=baseline-init
+    local/ts-vad/diarize_TS-VAD_it1.sh --cmd "$train_cmd" \
+      --ref-rttm $ref_rttm \
+      --ivector-affix $ivector_affix \
+      $ts_vad_dir $ivector_dir ${datadir}_diarized \
+      $ts_vad_dir/it${it}_${ivector_affix} || exit 1
+
+    # output of the latest iteration; it is passed on to the next one
+    initdir=$ts_vad_dir/it${it}_${ivector_affix}/${datadir}_U06_hires_split10000
+    # 2nd and further iterations
+    while [ $it -lt $ts_vad_num_iters ]; do
+      ivector_affix=it${it}-init
+      it=$((it+1))
+      local/ts-vad/diarize_TS-VAD_it2.sh --cmd "$train_cmd" \
+        --ref-rttm $ref_rttm \
+        --it $it \
+        --ivector-affix $ivector_affix \
+        --channels "CH1 CH2 CH3 CH4" \
+        --audio_dir $audio_dir \
+        $ts_vad_dir $ivector_dir $initdir \
+        $ts_vad_dir/it${it}_${ivector_affix} || exit 1
+      initdir=$ts_vad_dir/it${it}_${ivector_affix}/${mode}_20ch-AVG_hires_split10000_18ups
+    done
+
+    if [ ! -f data/${datadir}_ts-vad-it${ts_vad_num_iters}-diarized_hires/feats.scp ]; then
+      cat $initdir/scoring/rttm | awk '{$2=$2"_U06"; print $0}' > $initdir/rttm
+      local/prepare_diarized_data.sh --cmd "$train_cmd" \
+        $initdir data/$datadir data/${datadir}_ts-vad-it${ts_vad_num_iters}-diarized || exit 1
+    fi
+  done
+fi
+
+#######################################################################
+# Decode diarized output using trained chain model
+#######################################################################
+if [ $stage -le 6 ]; then
+  for datadir in ${test_sets}; do
+    echo "$0 performing decoding on the extracted features"
+    asr_model_dir=exp/chain_${train_set}_cleaned_rvb
+    local/nnet3/decode.sh --affix 2stage --acwt 1.0 --post-decode-acwt 10.0 \
+      --frames-per-chunk 150 --nj $nj --ivector-dir exp/nnet3_${train_set}_cleaned_rvb \
+      data/${datadir}_ts-vad-it${ts_vad_num_iters}-diarized data/lang $asr_model_dir/tree_sp/graph $asr_model_dir/tdnn1b_sp/ || exit 1
+  done
+fi
+
+#######################################################################
+# Score decoded dev/eval sets
+#######################################################################
+if [ $stage -le 7 ]; then
+  # final scoring to get the challenge result
+  # please specify both dev and eval set directories so that the search parameters
+  # (insertion penalty and language model weight) will be tuned using the dev set
+  local/score_for_submit.sh --stage $score_stage \
+    --dev_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_dev_${enhancement}_dereverb_ts-vad-it${ts_vad_num_iters}-diarized_2stage \
+    --dev_datadir dev_${enhancement}_dereverb_ts-vad-it${ts_vad_num_iters}-diarized_hires \
+    --eval_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_eval_${enhancement}_dereverb_ts-vad-it${ts_vad_num_iters}-diarized_2stage \
+    --eval_datadir eval_${enhancement}_dereverb_ts-vad-it${ts_vad_num_iters}-diarized_hires
+fi
+exit 0;
diff --git a/egs/chime6/s5b_track2/local/prepare_diarized_data.sh
b/egs/chime6/s5b_track2/local/prepare_diarized_data.sh
new file mode 100755
index 00000000000..52468cb138b
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/prepare_diarized_data.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+# Copyright  2019  Ashish Arora, Vimal Manohar
+#            2020  Ivan Medennikov
+# Apache 2.0.
+# This script takes an rttm file, and prepares a diarized data directory.
+# The output directory contains a text file which can be used for scoring.
+# Note that the data directory is written to <out-dir>_hires, i.e. the
+# "_hires" suffix is appended to the <out-dir> argument.
+
+stage=0
+nj=8
+cmd=run.pl
+echo "$0 $@"  # Print the command line for logging
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+if [ $# != 3 ]; then
+  echo "Usage: $0 <rttm-dir> <in-data-dir> <out-dir>"
+  echo "e.g.: $0 data/rttm data/dev data/dev_diarized"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs."
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+rttm_dir=$1
+data_in=$2
+out_dir=$3
+
+for f in $rttm_dir/rttm $data_in/wav.scp; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+if [ $stage -le 0 ]; then
+  echo "$0 copying data files in output directory"
+  cp $rttm_dir/rttm $rttm_dir/rttm_1
+  # strip the literal ".ENH" suffix (the dot is escaped so that it does not
+  # match an arbitrary character)
+  sed -i 's/\.ENH//g' $rttm_dir/rttm_1
+  # removing participant introduction from the hypothesis rttm
+  # UEM file contains the scoring durations for each recording
+  local/truncate_rttm.py $rttm_dir/rttm_1 local/uem_file $rttm_dir/rttm_introduction_removed
+  mkdir -p ${out_dir}_hires
+  cp ${data_in}/{wav.scp,utt2spk} ${out_dir}_hires
+  utils/data/get_reco2dur.sh ${out_dir}_hires
+fi
+
+if [ $stage -le 1 ]; then
+  echo "$0 creating segments file from rttm and utt2spk, reco2file_and_channel "
+  local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm_dir/rttm_introduction_removed \
+    <(awk '{print $2".ENH "$2" "$3}' $rttm_dir/rttm_introduction_removed |sort -u) \
+    ${out_dir}_hires/utt2spk ${out_dir}_hires/segments
+
+  utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt
+
+  awk '{print $1" "$1" 1"}' ${out_dir}_hires/wav.scp > ${out_dir}_hires/reco2file_and_channel
+  utils/fix_data_dir.sh ${out_dir}_hires || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0 extracting mfcc freatures using segments file"
+  steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$cmd" ${out_dir}_hires || exit 1;
+  steps/compute_cmvn_stats.sh ${out_dir}_hires || exit 1;
+  cp $data_in/text.bak ${out_dir}_hires/text
+fi
diff --git a/egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh b/egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh
new file mode 100755
index 00000000000..425b90e92ee
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+# Copyright 2012  Brno University of Technology (Author: Karel Vesely)
+#           2013  Johns Hopkins University (Author: Daniel Povey)
+#           2015  Vijayaditya Peddinti
+#           2016  Vimal Manohar
+#           2017  Pegah Ghahremani
+#           2020  Ivan Medennikov
+# Apache 2.0
+
+# Runs an nnet3 model on a data directory and converts selected output
+# columns (one per speaker) into per-frame weights in <output-dir>/weights.ark.
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+stage=0
+# Begin configuration.
+srcdir=
+frames_per_chunk=50
+extra_left_context=0
+extra_right_context=0
+extra_left_context_initial=-1
+extra_right_context_final=-1
+online_ivector_dir=
+graphs_scp=
+max_jobs_run=20
+n_spk=4
+
+normalize_transform=
+add_deltas=false
+delta_opts=
+num_threads=1
+use_gpu=true
+mb_size=128
+optimize=false
+apply_exp=true
+use_subsampling=false
+# End configuration options.
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f path.sh ] && . ./path.sh # source the path.
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <data-dir> <nnet-model> <output-dir>"
+  echo "e.g.: $0 data/train exp/nnet4/bnex.raw data_bn/train"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --nj <nj>                                        # number of parallel jobs"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+data=$1
+extractor=$2
+dir=$3
+
+# NOTE(review): nothing in this script creates $dir/.done; presumably the
+# caller is expected to do that -- confirm.
+if [ -f $dir/.done ]; then
+  echo "$0: $dir/.done already exists!"
+  exit 0;
+fi
+
+[ -z "$srcdir" ] && srcdir=`dirname $extractor`
+
+mkdir -p $dir/{log,tmp}
+sdata=$data/split${nj}
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || \
+  split_data.sh $data $nj || exit 1;
+
+extra_files=
+if [ ! -z "$online_ivector_dir" ]; then
+  steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1
+  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
+fi
+
+for f in $extractor $data/feats.scp $extra_files; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
+[ ! -z "$delta_opts" ] && add_deltas=true
+
+[ -z "$normalize_transform" ] && [ -f $srcdir/normalize.feature_transform ] && normalize_transform=$srcdir/normalize.feature_transform
+echo "normalize transform file: $normalize_transform"
+
+cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
+
+feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
+
+if [ ! -z "$normalize_transform" ]; then
+  feats="$feats nnet-forward $normalize_transform ark:- ark:- |"
+fi
+
+if $add_deltas; then
+  feats="$feats add-deltas $delta_opts ark:- ark:- |"
+fi
+
+ivector_opts=
+if [ ! -z "$online_ivector_dir" ]; then
+  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
+  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
+fi
+
+frame_subsampling_opt=
+if [ -f $srcdir/frame_subsampling_factor ] && $use_subsampling ; then
+  # e.g. for 'chain' systems
+  frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor)
+  frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
+  cp $srcdir/frame_subsampling_factor $dir
+  if [[ $frame_subsampling_factor -gt 1 ]]; then
+    # Assume a chain system, check argument sanity.
+    # NOTE(review): scale_opts and acoustic_scale are never set in this
+    # script, so the message below is always printed when this is reached.
+    if [[ ! ($scale_opts == *--self-loop-scale=1.0* &&
+      $scale_opts == *--transition-scale=1.0* &&
+      $acoustic_scale = '1.0') ]]; then
+      echo "$0: ERROR: frame-subsampling-factor is not 1, assuming a chain system."
+      echo "... You should pass the following options to this script:"
+      echo " --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'" \
+        "--acoustic_scale 1.0"
+    fi
+  fi
+fi
+
+##
+gpu_opt=
+thread_string=
+if $use_gpu ; then
+  thread_string="-batch --minibatch-size=$mb_size"
+  gpu_opt="--gpu 1"
+  use_gpu=wait
+else
+  echo "Warning! GPU is disabled, are you okay?"
+  thread_string=""
+  use_gpu=no
+fi
+
+if [ $stage -le 1 ]; then
+  $cmd --max-jobs-run $max_jobs_run $gpu_opt JOB=1:$nj $dir/log/nnet3_compute.JOB.log \
+    nnet3-compute$thread_string $ivector_opts $frame_subsampling_opt \
+    --apply-exp=$apply_exp \
+    --frames-per-chunk=$frames_per_chunk \
+    --extra-left-context=$extra_left_context \
+    --extra-right-context=$extra_right_context \
+    --extra-left-context-initial=$extra_left_context_initial \
+    --extra-right-context-final=$extra_right_context_final \
+    --use-gpu=$use_gpu \
+    $extractor "$feats" ark,t:$dir/tmp/outputs.JOB.ark || exit 1;
+  cat $dir/tmp/outputs.*.ark > $dir/outputs.ark
+  rm $dir/tmp/outputs.*.ark
+fi
+
+if [ $stage -le 2 ]; then
+  [ -f $dir/weights.ark ] && rm $dir/weights.ark
+  for i in `seq $n_spk`; do
+    $cmd $dir/log/make_weights.$i.log \
+      select-feats $((2*i-1)) ark:$dir/outputs.ark ark:- \| \
+      feat-to-post ark:- ark:- \| \
+      post-to-weights ark:- ark,t:"| sed s/\ /-$i\ / > $dir/weights.$i.ark" || exit 1;
+  done
+  cat $dir/weights.*.ark | sort > $dir/weights.ark
+  rm $dir/outputs.ark
+  rm $dir/weights.*.ark
+fi
+
+echo "$0: done extracting weights"
diff --git a/egs/chime6/s5b_track2/local/ts-vad/convert_prob_to_rttm.py b/egs/chime6/s5b_track2/local/ts-vad/convert_prob_to_rttm.py
new file mode 100644
index 00000000000..e95f36c9734
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/convert_prob_to_rttm.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+
+# Copyright 2020  Yuri Khokhlov, Ivan Medennikov
+# Apache 2.0.
+
+"""This script converts TS-VAD output probabilities to a NIST RTTM file.
+
+The input is a Kaldi rspecifier holding one vector of per-frame speech
+probabilities per utterance. The utterance ID is parsed by applying
+--reg_exp twice, so with the default pattern it has the form:
+    <session>-<fragment-index>-<speaker>
+Fragments of the same session/speaker pair are concatenated in the order
+in which they are read.
+
+The output RTTM format is:
+    <type> <file> <chnl> <tbeg> <tdur> <ortho> <stype> <name> <conf> <slat>
+where:
+    <type> = "SPEAKER"
+    <file> = <recording-id>
+    <chnl> = "1"
+    <tbeg> = start time of segment
+    <tdur> = duration of segment
+    <ortho> = "<NA>"
+    <stype> = "<NA>"
+    <name> = <speaker-id>
+    <conf> = "<NA>"
+    <slat> = "<NA>"
+"""
+
+
+import os
+import argparse
+import regex as re
+import numpy as np
+from scipy import signal, ndimage
+from kaldiio import ReadHelper
+
+
+class Segment:
+    """A run of consecutive frames [begin, end) sharing one label."""
+
+    def __init__(self, begin, end, label):
+        self.begin = begin
+        self.end = end
+        self.label = label
+
+    def length(self):
+        """Return the length of the segment in frames."""
+        return self.end - self.begin
+
+
+class VadProbSet:
+    """Per-frame speech probabilities keyed by "<session>-<speaker>"."""
+
+    def __init__(self, vad_rspec, reg_exp):
+        """Read all vectors from vad_rspec and join them per session/speaker.
+
+        reg_exp is a compiled pattern with two groups; it is matched first
+        against the utterance ID and then against its first group.
+        """
+        # Kept for convert(), which splits the "<session>-<speaker>" keys
+        # again and must not depend on a module-level global.
+        self.reg_exp = reg_exp
+        data = dict()
+        # Last fragment index seen for each key. Tracked per key so that
+        # interleaved sessions/speakers do not trip the ordering check.
+        last_indx = dict()
+        with ReadHelper(vad_rspec) as reader:
+            for utid, prob in reader:
+                result = reg_exp.match(utid)
+                assert result is not None, 'Wrong utterance ID format: \"{}\"'.format(utid)
+                sess_indx = result.group(1)
+                spkr = result.group(2)
+
+                result = reg_exp.match(sess_indx)
+                assert result is not None, 'Wrong utterance ID format: \"{}\"'.format(sess_indx)
+                sess = result.group(1)
+                indx = int(result.group(2))
+
+                sess = sess + '-' + spkr
+
+                if sess not in data:
+                    assert indx == 1
+                    data[sess] = list()
+                    last_indx[sess] = -1
+                assert indx >= last_indx[sess]
+                data[sess].append(prob)
+                last_indx[sess] = indx
+        print(' loaded {} sessions'.format(len(data)))
+        print(' combining fragments')
+        self.data = dict()
+        for sess, items in data.items():
+            self.data[sess] = np.hstack(items)
+
+    def apply_filter(self, window, threshold, threshold_first):
+        """Turn the probabilities into 0/1 labels (int32), in place.
+
+        A median filter of width window is applied when window > 1: on the
+        binary labels if threshold_first is set, otherwise on the raw
+        probabilities before they are compared with threshold.
+        """
+        for sess in self.data.keys():
+            if threshold_first:
+                self.data[sess] = (self.data[sess] > threshold).astype(dtype=np.int32)
+                if window > 1:
+                    self.data[sess] = signal.medfilt(self.data[sess], window).astype(dtype=np.int32)
+            else:
+                if window > 1:
+                    self.data[sess] = signal.medfilt(self.data[sess], window)
+                self.data[sess] = (self.data[sess] > threshold).astype(dtype=np.int32)
+
+    def convert(self, frame_shift, min_silence, min_speech, out_rttm):
+        """Write the speech segments of every session/speaker to out_rttm.
+
+        frame_shift, min_silence and min_speech are in seconds. A segment
+        (other than the first one) that is shorter than the minimum for its
+        label is absorbed into the segment before it.
+        """
+        min_silence = int(round(min_silence / frame_shift))
+        min_speech = int(round(min_speech / frame_shift))
+        with open(out_rttm, 'wt', encoding='utf-8') as wstream:
+            for sess, prob in self.data.items():
+                print(' session: {} num_frames: {} duration: {:.2f} hrs'.format(sess, len(prob), len(prob) * frame_shift / 60 / 60))
+                # The key is loop-invariant, so split it once per session.
+                result = self.reg_exp.match(sess)
+                assert result is not None, 'Wrong format: \"{}\"'.format(sess)
+                utid = result.group(1)
+                spk = result.group(2)
+                # Collapse the frame labels into runs of equal labels.
+                segments = list()
+                for i, label in enumerate(prob):
+                    if (len(segments) == 0) or (segments[-1].label != label):
+                        segments.append(Segment(i, i + 1, label))
+                    else:
+                        segments[-1].end += 1
+                if (min_silence > 0) or (min_speech > 0):
+                    items = segments
+                    segments = list()
+                    for segm in items:
+                        if len(segments) == 0:
+                            segments.append(segm)
+                        elif segm.label == segments[-1].label:
+                            segments[-1].end = segm.end
+                        else:
+                            min_length = min_silence if segm.label == 0 else min_speech
+                            if segm.length() < min_length:
+                                segments[-1].end = segm.end
+                            else:
+                                segments.append(segm)
+                for segm in segments:
+                    if segm.label == 1:
+                        begin = frame_shift * segm.begin
+                        length = frame_shift * segm.length()
+                        wstream.write('SPEAKER {} 1 {:7.3f} {:7.3f} <NA> <NA> {} <NA> <NA>\n'.format(utid, begin, length, spk))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Usage: convert_prob_to_rttm.py <vad-rspec> <out-rttm>')
+    parser.add_argument("--frame_shift", "-s", type=float, default=0.010)
+    parser.add_argument("--reg_exp", "-x", type=str, default=r'^(\S+)-(\d+)$')
+    parser.add_argument("--window", "-w", type=int, default=1)
+    parser.add_argument("--threshold", "-t", type=float, default=0.5)
+    parser.add_argument("--threshold_first", "-r", action="store_true")
+    parser.add_argument("--min_silence", "-k", type=float, default=0.0)
+    parser.add_argument("--min_speech", "-m", type=float, default=0.0)
+    parser.add_argument('vad_rspec', type=str)
+    parser.add_argument('out_rttm', type=str)
+    args = parser.parse_args()
+
+    print('Options:')
+    print(' Frame shift in sec: {}'.format(args.frame_shift))
+    print(' Utterance ID regexp: {}'.format(args.reg_exp))
+    print(' Med. filter window: {}'.format(args.window))
+    print(' Prob. threshold: {}'.format(args.threshold))
+    print(' Apply thresh. first: {}'.format(args.threshold_first))
+    print(' Min silence length: {}'.format(args.min_silence))
+    print(' Min speech length: {}'.format(args.min_speech))
+    print(' VAD rspec: {}'.format(args.vad_rspec))
+    print(' Output rttm file: {}'.format(args.out_rttm))
+
+    reg_exp = re.compile(args.reg_exp)
+
+    parent = os.path.dirname(os.path.abspath(args.out_rttm))
+    if not os.path.exists(parent):
+        os.makedirs(parent)
+
+    print('Loading VAD probabilities')
+    vad_prob = VadProbSet(args.vad_rspec, reg_exp)
+
+    print('Applying filtering')
+    vad_prob.apply_filter(args.window, args.threshold, args.threshold_first)
+
+    print('Writing rttm')
+    vad_prob.convert(args.frame_shift, args.min_silence, args.min_speech, args.out_rttm)
diff --git a/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it1.sh b/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it1.sh
new file mode 100755
index 00000000000..fa80e9fb03b
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it1.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+# Copyright 2020 Ivan Medennikov
+
+# Apache 2.0.
+# +# This script performs 1st iteration of TS-VAD diarization +# using an initial diarization rttm to estimate i-vectors + +cmd="run.pl" +ref_rttm= +lang=data/lang + +#blstm processing parameters +extra_left_context=30 +extra_right_context=30 +frames_per_chunk=40 + +#post-processing parameters +thr=0.4 +window=51 +min_silence=0.3 +min_speech=0.2 + +nj=8 +nj_feats=2 +piece=10000 + +ivector_affix=baseline-init + +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 4 ]; then + echo "Usage: $0 " + echo "e.g.: $0 exp/ts-vad exp/nnet3 dev_beamformit_dereverb_diarized exp/ts-vad/it1" + echo "Options: " + echo " --nj # number of parallel jobs." + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --ref_rttm ./chime6_rttm/dev_rttm # the location of the reference RTTM file" + echo " --ivector_affix baseline-init # affix corresponding to the initial diarization" + echo " --piece 10000 # raw wavs will be splitted into non-overlapping pieces of this size (in frames)" + echo " --thr 0.4 # post-processing: probability threshold" + echo " --window 51 # post-processing: median filter window (in frames)" + echo " --min_silence 0.3 # post-processing: minimum length of silence (in seconds)" + echo " --min_speech 0.2 # post-processing: minimum length of speech (in seconds)" + exit 1; +fi + +dir=$1 +ivector_dir=$2 +initname=$3 +outdir=$4 + +test="$(cut -d'_' -f1 <<<"$initname")" + +#estimating i-vectors using the initial diarization +dset=${initname}_hires +ivdir=${ivector_dir}/ivectors_${dset}_${ivector_affix} +if [ ! 
-f $ivdir/ivector_online.scp ]; then + echo "Extracting i-vectors for $dset" + steps/online/nnet2/extract_ivectors.sh --cmd "$cmd" --nj $nj \ + --silence-weight 0.00001 \ + --sub-speaker-frames 0 --max-count 100 \ + data/$dset $lang $ivector_dir/extractor $ivdir || exit 1; +fi + +#preparing 4-speaker track2 data +dsetsrc=$dset +name=$(echo $initname | sed s/_diarized//) +dset=${name}_U06_hires +if [ ! -f data/$dset/.done ]; then + mkdir -p data/$dset + cp data/$dsetsrc/wav.scp data/$dset/wav.scp + awk '{print $1" "$1}' data/$dset/wav.scp > data/$dset/utt2spk + awk '{print $1" "$1}' data/$dset/wav.scp > data/$dset/spk2utt + utils/fix_data_dir.sh data/$dset + steps/make_mfcc.sh --nj $nj_feats --mfcc-config conf/mfcc_hires.conf data/$dset data/$dset/log data/$dset/data || exit 1; + touch data/$dset/.done +fi + +#splitting 4-speaker track2 data into pieces +dsetsrc=$dset +dset=${dset}_split${piece} +if [ ! -f data/$dset/.done ]; then + mkdir -p data/$dset + cp data/${dsetsrc}/wav.scp data/$dset + feat-to-len scp:data/$dsetsrc/feats.scp ark,t:data/$dsetsrc/utt2len + local/ts-vad/split_feats_seg.pl data/$dsetsrc/feats.scp data/$dsetsrc/utt2spk data/$dsetsrc/utt2len $piece data/$dset/feats.scp data/$dset/utt2spk data/$dset/segments + utils/utt2spk_to_spk2utt.pl data/$dset/utt2spk > data/$dset/spk2utt + utils/fix_data_dir.sh data/$dset + touch data/$dset/.done +fi + +#preparing 4-speaker i-vectors +iv4dir=${ivector_dir}/ivectors-4spk_${dset}_${ivector_affix} +if [ ! 
-f $iv4dir/.done ]; then + mkdir -p $iv4dir + echo "Making pseudo-online 4spk i-vectors using source $ivdir" + cat $ivdir/ivectors_spk.*.ark > $iv4dir/ivectors_spk.ark + + for spk in `seq 4`; do + awk -v "spk=$spk" '{printf "%s %s-%s\n", $1, $2, spk}' data/$dset/utt2spk > data/$dset/utt2spk.$spk + done + + $train_cmd JOB=1:4 $iv4dir/log/apply-map.JOB.log \ + utils/apply_map.pl -f 2 $iv4dir/ivectors_spk.ark \$iv4dir/ivectors_utt.JOB.ark || exit 1; + + ivector_dim=$[$(head -n 1 $ivdir/ivectors_spk.1.ark | wc -w) - 3] || exit 1; + base_feat_dim=$(feat-to-dim scp:data/$dset/feats.scp -) || exit 1; + start_dim=$base_feat_dim + end_dim=$[$base_feat_dim+$ivector_dim-1] + absdir=$(utils/make_absolute.sh $iv4dir) + cp $ivdir/{ivector_period,final.ie.id} $iv4dir/ + ivector_period=$(cat $ivdir/ivector_period) + + $cmd JOB=1:4 $iv4dir/log/duplicate_feats.JOB.log \ + append-vector-to-feats scp:data/$dset/feats.scp ark:$iv4dir/ivectors_utt.JOB.ark ark:- \| \ + select-feats "$start_dim-$end_dim" ark:- ark:- \| \ + subsample-feats --n=$ivector_period ark:- ark:- \| \ + copy-feats --compress=true ark:- \ + ark,scp:$absdir/ivector_online.JOB.ark,$absdir/ivector_online.JOB.scp || exit 1; + + $cmd $iv4dir/log/paste-feats.log \ + paste-feats scp:$iv4dir/ivector_online.1.scp scp:$iv4dir/ivector_online.2.scp scp:$iv4dir/ivector_online.3.scp scp:$iv4dir/ivector_online.4.scp ark:- \| \ + copy-feats --compress=true ark:- ark,scp:$absdir/ivector_online.ark,$absdir/ivector_online.scp || exit 1; + touch $iv4dir/.done +fi + +#computing TS-VAD per-frame probabilities for each speaker +out=$outdir/$dset +if [ ! 
-f $out/.done ]; then + local/ts-vad/compute_ts-vad_weights.sh --nj $nj_feats --use-gpu true --cmd "$cmd" --online-ivector-dir $iv4dir \ + --extra-left-context $extra_left_context --extra-right-context $extra_right_context --frames-per-chunk $frames_per_chunk \ + data/$dset $dir/final.raw $out || exit 1; + touch $out/.done +fi + +#TS-VAD probabilities post-processing and DER scoring +scoring=$out/scoring +hyp_rttm=$scoring/rttm +if [ ! -f $scoring/.done ]; then + if [ ! -f $hyp_rttm ]; then + python local/ts-vad/convert_prob_to_rttm.py --threshold $thr --window $window --min_silence $min_silence --min_speech $min_speech ark:"sort $out/weights.ark |" $hyp_rttm || exit 1; + fi + echo "Diarization results for $test" + [ ! -f $ref_rttm.scoring ] && sed 's/_U0[1-6]\.ENH//g' $ref_rttm > $ref_rttm.scoring + [ ! -f $hyp_rttm.scoring ] && sed 's/_U0[1-6]\.ENH//g' $hyp_rttm > $hyp_rttm.scoring + ref_rttm_path=$(readlink -f ${ref_rttm}.scoring) + hyp_rttm_path=$(readlink -f ${hyp_rttm}.scoring) + [ ! -f ./local/uem_file.scoring ] && cat ./local/uem_file | grep 'U06' | sed 's/_U0[1-6]//g' > ./local/uem_file.scoring + cd dscore && python score.py -u ../local/uem_file.scoring -r $ref_rttm_path \ + -s $hyp_rttm_path 2>&1 | tee -a ../$scoring/DER && cd .. || exit 1; + touch $scoring/.done +fi diff --git a/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it2.sh b/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it2.sh new file mode 100755 index 00000000000..6d99428181a --- /dev/null +++ b/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it2.sh @@ -0,0 +1,209 @@ +#!/bin/bash +#!/bin/bash +# Copyright 2020 Ivan Medennikov + +# Apache 2.0. +# +# This script performs 2nd and further iterations of TS-VAD diarization +# on a set of kinect channels followed by averaging. 
+# Probabilities from the previous iteration are used to estimate i-vectors
+
+cmd="run.pl" # NOTE(review): not used below; jobs run via $train_cmd/$decode_cmd, which this script does not define itself
+ref_rttm=
+lang=data/lang
+audio_dir=CHiME6/audio
+
+#blstm processing parameters
+extra_left_context=30
+extra_right_context=30
+frames_per_chunk=40
+
+#post-processing parameters
+thr=0.4
+window=51
+min_silence=0.3
+min_speech=0.2
+
+nj=8
+nj_feats=2
+piece=10000 #raw wavs will be split into non-overlapping pieces of this size (in frames)
+ups=18 #number of pieces considered as one speaker
+wpeid= #affix for non-original wavs, e.g., blockwise WPE processed
+channels="CH1 CH2 CH3 CH4"
+
+#parameters for modification of initial weights
+t=0 #absolute threshold
+mt=0.7 #relative threshold for pi/(p1+p2+p3+p4)
+
+it=2
+ivector_affix=it1-init
+
+echo "$0 $@" # Print the command line for logging
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+if [ $# != 4 ]; then
+  echo "Usage: $0 <ts-vad-model-dir> <ivector-dir> <init-weights-dir> <out-dir>"
+  echo "e.g.: $0 exp/ts-vad exp/nnet3 exp/ts-vad/it1 exp/ts-vad/it2"
+  echo "Options: "
+  echo " --nj <nj> # number of parallel jobs."
+  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo " --ref_rttm ./chime6_rttm/dev_rttm # the location of the reference RTTM file"
+  echo " --it 2 # current iteration of TS-VAD"
+  echo " --ivector_affix it1-init # affix corresponding to the initial weights"
+  echo " --channels CH1 CH2 CH3 CH4 # kinect channels to be processed"
+  echo " --audio_dir CHiME6/audio # path to wav files"
+  echo " --wpeid WPE2m # affix for non-original wavs, e.g., blockwise WPE processed"
+  echo " --piece 10000 # raw wavs will be splitted into non-overlapping pieces of this size (in frames)"
+  echo " --ups 18 # number of pieces considered as one speaker"
+  echo " --t 0 # absolute threshold for initial weights"
+  echo " --mt 0.7 # relative threshold for pi/(p1+p2+p3+p4) in initial weights (to exclude overlapping regions from i-vectors estimation)"
+  echo " --thr 0.4 # post-processing: probability threshold"
+  echo " --window 51 # post-processing: median filter window (in frames)"
+  echo " --min_silence 0.3 # post-processing: minimum length of silence (in seconds)"
+  echo " --min_speech 0.2 # post-processing: minimum length of speech (in seconds)"
+  exit 1;
+fi
+
+dir=$1
+ivector_dir=$2
+initdir=$3
+outdir=$4
+
+initname=$(basename $initdir)
+test="$(cut -d'_' -f1 <<<"$initname")"
+
+weights=$initdir/weights.ark
+weights_mod=$initdir/weights_t${t}_mt${mt}.ark
+if [ ! -f ${weights_mod}.gz ]; then
+  python local/ts-vad/vad_prob_mod.py --threshold $t --multispk_threshold $mt ark:$weights ark,t:${weights_mod}
+  cat ${weights_mod} | sed s/_U06.ENH// | sort | gzip -c > ${weights_mod}.gz
+  rm $weights_mod
+fi
+for spk in `seq 4`; do # one weights archive per speaker, with the "-<spk>" suffix removed from the ids
+  [ ! -f ${weights_mod}.${spk}.gz ] && gunzip -c ${weights_mod}.gz | grep "\-$spk\ " | sed s/\-$spk\ /\ / | gzip -c > ${weights_mod}.${spk}.gz
+done
+
+kinects="U01 U02 U03 U04 U05 U06"
+[ "$test" == "dev" ] && kinects="U01 U02 U03 U04 U06"
+[ "$test" == "eval" ] && kinects="U01 U02 U04 U05 U06"
+
+sum_scps="" # weights.ark rspecifiers of all processed channels
+n=0 # number of processed channels
+for u in $kinects; do
+  for ch in $channels; do
+    id=${u}.${ch}${wpeid}
+    echo "processing $id"
+
+    dset=${test}_${id}_hires
+    if [ ! -f data/$dset/.done ]; then
+      mkdir -p data/$dset
+      ls $audio_dir/$test/ | grep "wav" | grep "$u" | grep "$ch" | awk -v "pth=$audio_dir/$test" '{printf "%s %s/%s\n", $1, pth, $1}' | sed -E s/_[^\ ]+// > data/$dset/wav.scp
+      awk '{print $1" "$1}' data/$dset/wav.scp > data/$dset/utt2spk
+      awk '{print $1" "$1}' data/$dset/wav.scp > data/$dset/spk2utt
+      utils/fix_data_dir.sh data/$dset
+      steps/make_mfcc.sh --nj $nj_feats --mfcc-config conf/mfcc_hires.conf data/$dset data/$dset/log data/$dset/data || exit 1;
+      touch data/$dset/.done
+    fi
+
+    dsetsrc=$dset
+    dset=${dset}_split${piece}
+    if [ ! -f data/$dset/.done ]; then
+      mkdir -p data/$dset
+      cp data/${dsetsrc}/wav.scp data/$dset
+      feat-to-len scp:data/$dsetsrc/feats.scp ark,t:data/$dsetsrc/utt2len
+      local/ts-vad/split_feats_seg.pl data/$dsetsrc/feats.scp data/$dsetsrc/utt2spk data/$dsetsrc/utt2len $piece data/$dset/feats.scp data/$dset/utt2spk data/$dset/segments
+      utils/utt2spk_to_spk2utt.pl data/$dset/utt2spk > data/$dset/spk2utt
+      utils/fix_data_dir.sh data/$dset
+      touch data/$dset/.done
+    fi
+
+    dsetsrc=$dset
+    dset=${dset}_${ups}ups
+    if [ ! -f data/$dset/.done ]; then
+      utils/copy_data_dir.sh data/$dsetsrc data/$dset
+      local/ts-vad/modify_ups_utt2spk.pl data/$dsetsrc/utt2spk $ups data/$dset/utt2spk
+      utils/utt2spk_to_spk2utt.pl data/$dset/utt2spk > data/$dset/spk2utt
+      utils/fix_data_dir.sh data/$dset
+      touch data/$dset/.done
+    fi
+
+    ivdir=${ivector_dir}/${test}_${ivector_affix}/ivectors_${dset}
+    for spk in `seq 4`; do
+      if [ ! -f $ivdir/$spk/ivector_online.scp ]; then
+        echo "Extracting i-vectors for $dset"
+        steps/online/nnet2/extract_ivectors.sh --cmd "$decode_cmd" --nj $nj \
+          --silence-weight 0.00001 \
+          --sub-speaker-frames 0 --max-count 100 \
+          data/$dset $lang $ivector_dir/extractor ${weights_mod}.${spk}.gz $ivdir/$spk || exit 1;
+      fi
+    done
+
+    iv4dir=${ivector_dir}/${test}_${ivector_affix}/ivectors-4spk_${dset}
+    if [ ! -f $iv4dir/.done ]; then
+      mkdir -p $iv4dir
+      echo "Making pseudo-online 4spk i-vectors using source $ivdir"
+      for spk in `seq 4`; do
+        cat $ivdir/$spk/ivectors_spk.*.ark > $iv4dir/ivectors_spk.$spk.ark
+      done
+      $train_cmd JOB=1:4 $iv4dir/log/apply-map.JOB.log \
+        utils/apply_map.pl -f 2 $iv4dir/ivectors_spk.JOB.ark \<data/$dset/utt2spk \>$iv4dir/ivectors_utt.JOB.ark || exit 1; # NOTE(review): redirections were lost in this copy of the patch and are reconstructed - confirm upstream
+
+      ivector_dim=$(( $(head -n 1 $iv4dir/ivectors_spk.1.ark | wc -w) - 3 )) || exit 1;
+      base_feat_dim=$(feat-to-dim scp:data/$dset/feats.scp -) || exit 1;
+      start_dim=$base_feat_dim
+      end_dim=$(( base_feat_dim + ivector_dim - 1 ))
+      absdir=$(utils/make_absolute.sh $iv4dir)
+      cp $ivdir/1/{ivector_period,final.ie.id} $iv4dir/
+      ivector_period=$(cat $iv4dir/ivector_period)
+
+      $train_cmd JOB=1:4 $iv4dir/log/duplicate_feats.JOB.log \
+        append-vector-to-feats scp:data/$dset/feats.scp ark:$iv4dir/ivectors_utt.JOB.ark ark:- \| \
+        select-feats "$start_dim-$end_dim" ark:- ark:- \| \
+        subsample-feats --n=$ivector_period ark:- ark:- \| \
+        copy-feats --compress=true ark:- \
+        ark,scp:$absdir/ivector_online.JOB.ark,$absdir/ivector_online.JOB.scp || exit 1;
+
+      $train_cmd $iv4dir/log/paste-feats.log \
+        paste-feats scp:$iv4dir/ivector_online.1.scp scp:$iv4dir/ivector_online.2.scp scp:$iv4dir/ivector_online.3.scp scp:$iv4dir/ivector_online.4.scp ark:- \| \
+        copy-feats --compress=true ark:- ark,scp:$absdir/ivector_online.ark,$absdir/ivector_online.scp || exit 1;
+      touch $iv4dir/.done
+    fi
+
+    out=$outdir/$dset
+    if [ ! -f $out/.done ]; then
+      local/ts-vad/compute_ts-vad_weights.sh --nj $nj --use-gpu true --cmd "$decode_cmd" --online-ivector-dir $iv4dir \
+        --extra-left-context $extra_left_context --extra-right-context $extra_right_context --frames-per-chunk $frames_per_chunk \
+        data/$dset $dir/final.raw $out || exit 1;
+      touch $out/.done
+    fi
+    sum_scps="${sum_scps}ark:$out/weights.ark "
+    n=$((n+1))
+  done
+done
+
+id=${n}ch-AVG${wpeid}
+dset=${test}_${id}_hires_split${piece}_${ups}ups
+out=$outdir/$dset
+if [ ! -f $out/.done ]; then
+  scale=$(awk -v "n=$n" 'BEGIN {print 1/n}') # 1/n turns the sum over channels into an average
+  $train_cmd $out/log/vector-sum.log \
+    vector-sum $sum_scps ark:- \| vector-scale --scale=$scale ark:- ark,t:$out/weights.ark || exit 1;
+  touch $out/.done
+fi
+
+scoring=$out/scoring
+hyp_rttm=$scoring/rttm
+if [ ! -f $scoring/.done ]; then
+  if [ ! -f $hyp_rttm ]; then
+    python local/ts-vad/convert_prob_to_rttm.py --threshold $thr --window $window --min_silence $min_silence --min_speech $min_speech ark:"sort $out/weights.ark |" $hyp_rttm || exit 1;
+  fi
+  echo "Diarization results for $test"
+  [ ! -f $ref_rttm.scoring ] && sed 's/_U0[1-6]\.ENH//g' $ref_rttm > $ref_rttm.scoring
+  [ ! -f $hyp_rttm.scoring ] && sed 's/_U0[1-6]\.ENH//g' $hyp_rttm > $hyp_rttm.scoring
+  ref_rttm_path=$(readlink -f ${ref_rttm}.scoring)
+  hyp_rttm_path=$(readlink -f ${hyp_rttm}.scoring)
+  [ ! -f ./local/uem_file.scoring ] && cat ./local/uem_file | grep 'U06' | sed 's/_U0[1-6]//g' > ./local/uem_file.scoring
+  cd dscore && python score.py -u ../local/uem_file.scoring -r $ref_rttm_path \
+    -s $hyp_rttm_path 2>&1 | tee -a ../$scoring/DER && cd .. 
|| exit 1;
+  touch $scoring/.done
+fi
diff --git a/egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl b/egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl
new file mode 100755
index 00000000000..9deb0dbacf4
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl
@@ -0,0 +1,40 @@
+#!/usr/bin/perl
+# Usage: modify_ups_utt2spk.pl <utt2spk-in> <ups> <utt2spk-out>
+#
+# Splits every speaker of <utt2spk-in> into pseudo-speakers "<spk>-NN" that hold
+# <ups> consecutive (sorted) utterances each. A trailing group shorter than <ups>
+# gets the index of the group before it (00 if the speaker has fewer than <ups>).
+
+use strict;
+use warnings;
+
+my ($filein, $ups, $fileout) = @ARGV;
+
+open(my $fidin, "<", $filein) or die "cant open $filein : $!";
+open(my $fidout, ">", $fileout) or die "cant open $fileout : $!";
+
+# speaker -> list of its utterance ids
+my %spk2utt = ();
+while (my $line = <$fidin>)
+{
+  $line =~ s/\s+$//;
+  my ($utt, $spk) = split(/\s+/, $line);
+  push(@{$spk2utt{$spk}}, $utt);
+}
+close($fidin);
+
+foreach my $spk (sort keys %spk2utt)
+{
+  my $i = 0;
+  my $num = scalar @{$spk2utt{$spk}};
+  foreach my $utt (sort @{$spk2utt{$spk}})
+  {
+    my $sid = 1 + int($i / $ups);
+    # a group that cannot be filled completely is folded into the previous index
+    $sid -= 1 if ($ups * $sid > $num);
+    printf $fidout "%s %s-%02d\n", $utt, $spk, $sid;
+    $i += 1;
+  }
+}
+close($fidout);
+exit 0;
diff --git a/egs/chime6/s5b_track2/local/ts-vad/split_feats_seg.pl b/egs/chime6/s5b_track2/local/ts-vad/split_feats_seg.pl
new file mode 100755
index 00000000000..9f1ce0342a8
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/split_feats_seg.pl
@@ -0,0 +1,68 @@
+#!/usr/bin/perl
+# Usage: split_feats_seg.pl <feats.scp> <utt2spk> <utt2len> <chunk> \
+#                           <feats-out.scp> <utt2spk-out> <segments-out>
+#
+# Splits every entry of <feats.scp> into consecutive non-overlapping pieces of
+# at most <chunk> frames. For piece NNNN (numbered from 0001) of utterance U:
+#   <feats-out.scp>  gets "U-NNNN <rxfilename>[begin:end]" (inclusive frame range),
+#   <utt2spk-out>    gets "U-NNNN <speaker of U>",
+#   <segments-out>   gets "U-NNNN U <begin_sec> <end_sec>".
+# Times are frame indices divided by 100, i.e. a 10 ms frame shift is assumed.
+
+use strict;
+use warnings;
+
+my ($filein, $utt2spk, $utt2dur, $chunk, $fileout, $fileout2, $fileout3) = @ARGV;
+# a non-positive chunk would never advance $begin in the loop below
+(@ARGV == 7 && $chunk > 0) or die "Usage: $0 <feats.scp> <utt2spk> <utt2len> <chunk> <feats-out.scp> <utt2spk-out> <segments-out>\n";
+
+# Reads a two-column "key value" file into a hash and returns a reference to it.
+sub read_map
+{
+  my ($path) = @_;
+  my %map = ();
+  open(my $fid, "<", $path) or die "cant open $path : $!";
+  while (my $line = <$fid>)
+  {
+    $line =~ s/\s+$//;
+    my ($key, $value) = split(/\s+/, $line);
+    $map{$key} = $value if defined $key;
+  }
+  close($fid);
+  return \%map;
+}
+
+my $utt2len = read_map($utt2dur);
+my $spk_of = read_map($utt2spk);
+
+open(my $fidin, "<", $filein) or die "cant open $filein : $!";
+open(my $fidout, ">", $fileout) or die "cant open $fileout : $!";
+open(my $fidout2, ">", $fileout2) or die "cant open $fileout2 : $!";
+open(my $fidout3, ">", $fileout3) or die "cant open $fileout3 : $!";
+
+while (my $line = <$fidin>)
+{
+  $line =~ s/\s+$//;
+  my ($utt, $rxfile) = split(/\s+/, $line);
+  # utterances without a known length produce no pieces
+  next unless (defined $utt && defined $utt2len->{$utt});
+  my $len = $utt2len->{$utt};
+  my $id = 1;
+  for (my $begin = 0; $begin < $len; $begin += $chunk)
+  {
+    my $end = $begin + $chunk - 1;
+    $end = $len - 1 if ($end > $len - 1);
+    # zero-padded to 4 digits and recomputed for every piece, so ids stay unique past 999
+    my $suffix = sprintf("%04d", $id);
+    print $fidout "$utt-$suffix $rxfile\[$begin:$end\]\n";
+    print $fidout2 "$utt-$suffix $spk_of->{$utt}\n";
+    my $begin_sec = $begin / 100.0;
+    my $end_sec = $end / 100.0;
+    print $fidout3 "$utt-$suffix $utt $begin_sec $end_sec\n";
+    $id += 1;
+  }
+}
+close($fidin);
+close($fidout);
+close($fidout2);
+close($fidout3);
+exit 0;
diff --git a/egs/chime6/s5b_track2/local/ts-vad/vad_prob_mod.py b/egs/chime6/s5b_track2/local/ts-vad/vad_prob_mod.py
new file mode 100644
index 00000000000..38fe94a383e
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/vad_prob_mod.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Ivan Medennikov
+# Apache 2.0.
+
+"""This script modifies TS-VAD output probabilities applying
+absolute threshold (--threshold) and relative threshold (--multispk_threshold) for pi/(p1+p2+p3+p4)
+(to exclude overlapping regions from i-vectors estimation)"""
+
+import os
+import argparse
+import regex as re
+import numpy as np
+from scipy import signal, ndimage
+from kaldiio import ReadHelper, WriteHelper
+
+
+class WeightsSet:
+    """Per-frame speaker probabilities kept as data[session][piece][speaker].
+
+    The three keys are the groups captured by reg_exp from each utterance ID."""
+
+    def __init__(self, vad_rspec, reg_exp):
+        """Read every (utterance ID, vector) pair from vad_rspec into self.data."""
+        data = dict()
+        with ReadHelper(vad_rspec) as reader:
+            for utid, align in reader:
+                result = reg_exp.match(utid)
+                assert result is not None, 'Wrong VAD alignment utterance ID format: \"{}\"'.format(utid)
+                sess, piece, spkr = result.group(1), result.group(2), result.group(3)
+                data.setdefault(sess, dict()).setdefault(piece, dict())[spkr] = align
+        print(' loaded {} sessions'.format(len(data)))
+        self.data = data
+
+    def modify_prob(self, threshold, multispk_threshold, lowest_value):
+        """Overwrite in place with lowest_value every probability that is below
+        threshold, or whose ratio to the per-frame sum over all speakers of the
+        same piece is below multispk_threshold."""
+        for pieces in self.data.values():
+            for probs in pieces.values():
+                # The sum is taken before any value is modified and is sized by
+                # the longest vector, so every shorter vector indexes into it.
+                longest = max(probs, key=lambda spkr: len(probs[spkr]))
+                sumprob = probs[longest].copy()
+                for spkr, prob in probs.items():
+                    if spkr != longest:
+                        for i in range(len(prob)):
+                            sumprob[i] += prob[i]
+                for prob in probs.values():
+                    for i in range(len(prob)):
+                        if prob[i] < threshold:
+                            prob[i] = lowest_value
+                # NOTE(review): a zero sum is not guarded against here - confirm
+                # that the inputs never contain a frame where all speakers are 0.
+                for prob in probs.values():
+                    for i in range(len(prob)):
+                        if prob[i] / sumprob[i] < multispk_threshold:
+                            prob[i] = lowest_value
+
+    def write(self, vad_wspec):
+        """Write every vector to vad_wspec under the ID 'session-piece-speaker'."""
+        with WriteHelper(vad_wspec) as writer:
+            for sess, pieces in self.data.items():
+                for piece, probs in pieces.items():
+                    for spkr, prob in probs.items():
+                        writer(sess + '-' + piece + '-' + spkr, prob)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Usage: vad_prob_mod.py <vad-rspec> <vad-wspec>')
+    parser.add_argument("--reg_exp", "-x", type=str, default=r'^(S\d\d.*)\-(\d+)\-(\d)$')
+    parser.add_argument("--threshold", "-t", type=float, default=0.0)
+    parser.add_argument("--multispk_threshold", "-mt", type=float, default=0.8)
+    parser.add_argument("--lowest_value", "-l", type=float, default=0.00001)
+    parser.add_argument('vad_rspec', type=str)
+    parser.add_argument('vad_wspec', type=str)
+    args = parser.parse_args()
+
+    print('Options:')
+    print(' Utterance ID regexp: {}'.format(args.reg_exp))
+    print(' Absolute threshold: {}'.format(args.threshold))
+    print(' Multispeaker threshold for Pi/(P1+P2+P3+P4): {}'.format(args.multispk_threshold))
+    print(' Lowest value which is used when applying the thresholds: {}'.format(args.lowest_value))
+    print(' VAD rspec: {}'.format(args.vad_rspec))
+    print(' VAD wspec: {}'.format(args.vad_wspec))
+
+    # groups 1-3 of the regexp are used as session, piece and speaker
+    reg_exp = re.compile(args.reg_exp)
+
+    print('Loading VAD probabilities')
+    vad_align = WeightsSet(args.vad_rspec, reg_exp)
+
+    print('Modifying VAD probabilities')
+    vad_align.modify_prob(args.threshold, args.multispk_threshold, args.lowest_value)
+
+    print('Writing VAD probabilities')
+    vad_align.write(args.vad_wspec)
diff --git a/egs/chime6/s5b_track2/run.sh b/egs/chime6/s5b_track2/run.sh index d5548518287..5e278e17da7 100755 --- a/egs/chime6/s5b_track2/run.sh +++ b/egs/chime6/s5b_track2/run.sh @@ -24,12 +24,21 @@ num_data_reps=4 snrs="20:10:15:5:0" foreground_snrs="20:10:15:5:0" background_snrs="20:10:15:5:0" + +# pre-trained TS-VAD model +ts_vad_name=ts-vad_b.tar.gz +ts_vad_link=https://github.com/yuri-hohlov/ts-vad-data/raw/master/${ts_vad_name} +ts_vad_dir=exp/ts-vad_b +ivector_dir=exp/nnet3_b + # End configuration
section . ./utils/parse_options.sh . ./cmd.sh . ./path.sh +[ ! -f $ts_vad_archive ] && wget + if [ $decode_only == "true" ]; then stage=18 fi @@ -51,7 +60,7 @@ sad_train_set=train_worn_u400k test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb" # This script also needs the phonetisaurus g2p, srilm, beamformit -./local/check_tools.sh || exit 1; +#./local/check_tools.sh || exit 1; ########################################################################### # We first generate the synchronized audio files across arrays and @@ -287,11 +296,14 @@ fi ########################################################################## # DECODING: In track 2, we are given raw utterances without segment # or speaker information, so we have to decode the whole pipeline, i.e., -# SAD -> Diarization -> ASR. This is done in the local/decode.sh -# script. +# SAD -> Diarization (x-vectors clustering) -> TS-VAD Diarization -> ASR. +# This is done in the local/decode_ts-vad.sh script. ########################################################################## if [ $stage -le 18 ]; then - local/decode.sh --stage $decode_stage \ + [ ! -f $ts_vad_name ] && wget -O $ts_vad_name $ts_vad_link + [ ! 
-d $ts_vad_dir ] && tar -zxvf $ts_vad_name -C $(dirname $ts_vad_dir) + local/decode_ts-vad.sh --stage $decode_stage \ + --ts-vad-dir $ts_vad_dir --ivector-dir $ivector_dir \ --enhancement $enhancement \ --test-sets "$test_sets" fi From 418a42f4d4463f0a5d8a591e51f973b5929a459f Mon Sep 17 00:00:00 2001 From: medennikov Date: Sun, 31 May 2020 22:14:44 +0300 Subject: [PATCH 03/10] small fix in run.sh; remove redundant files; update RESULTS file --- egs/chime6/s5b_track2/RESULTS | 9 +- egs/chime6/s5b_track2/local/decode.sh | 217 ------------------ .../s5b_track2/local/decode_diarized.sh | 74 ------ egs/chime6/s5b_track2/run.sh | 2 +- 4 files changed, 8 insertions(+), 294 deletions(-) delete mode 100755 egs/chime6/s5b_track2/local/decode.sh delete mode 100755 egs/chime6/s5b_track2/local/decode_diarized.sh diff --git a/egs/chime6/s5b_track2/RESULTS b/egs/chime6/s5b_track2/RESULTS index 131b43cecf8..18d7f53e4a6 100644 --- a/egs/chime6/s5b_track2/RESULTS +++ b/egs/chime6/s5b_track2/RESULTS @@ -18,6 +18,11 @@ Dev (new RTTM) 63.42 70.83 Eval (old RTTM) 61.96 71.40 Eval (new RTTM) 68.20 72.54 +# Diarization (3 iterations of TS-VAD) + DER JER +Dev (new RTTM) 45.90 52.45 +Eval (new RTTM) 49.13 55.83 + # ASR nnet3 tdnn+chain -Dev: %WER 84.25 [ 49610 / 58881, 1937 ins, 34685 del, 12988 sub ] -Eval: %WER 77.94 [ 42971 / 55132, 1086 ins, 30839 del, 11046 sub ] +Dev: %WER 77.34 [ 45538 / 58881, 1099 ins, 24840 del, 19599 sub ] +Eval: %WER 71.90 [ 39639 / 55132, 1550 ins, 23748 del, 14341 sub ] diff --git a/egs/chime6/s5b_track2/local/decode.sh b/egs/chime6/s5b_track2/local/decode.sh deleted file mode 100755 index 8f094f5c4df..00000000000 --- a/egs/chime6/s5b_track2/local/decode.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/usr/bin/env bash -# -# This script decodes raw utterances through the entire pipeline: -# Feature extraction -> SAD -> Diarization -> ASR -# -# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) -# 2019 Desh Raj, David Snyder, Ashish Arora, 
Zhaoheng Ni -# Apache 2.0 - -# Begin configuration section. -nj=8 -stage=0 -sad_stage=0 -score_sad=true -diarizer_stage=0 -decode_diarize_stage=0 -score_stage=0 - -enhancement=beamformit - -# option to use the new RTTM reference for sad and diarization -use_new_rttm_reference=false -if $use_new_rttm_reference == "true"; then - git clone https://github.com/nateanl/chime6_rttm -fi - -# chime5 main directory path -# please change the path accordingly -chime5_corpus=/export/corpora4/CHiME5 -# chime6 data directories, which are generated from ${chime5_corpus}, -# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly -chime6_corpus=${PWD}/CHiME6 -json_dir=${chime6_corpus}/transcriptions -audio_dir=${chime6_corpus}/audio - -enhanced_dir=enhanced -enhanced_dir=$(utils/make_absolute.sh $enhanced_dir) || exit 1 - -# training data -train_set=train_worn_simu_u400k -test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb" - -. ./utils/parse_options.sh - -. ./cmd.sh -. ./path.sh -. ./conf/sad.conf - -# This script also needs the phonetisaurus g2p, srilm, beamformit -./local/check_tools.sh || exit 1 - -########################################################################### -# We first generate the synchronized audio files across arrays and -# corresponding JSON files. Note that this requires sox v14.4.2, -# which is installed via miniconda in ./local/check_tools.sh -########################################################################### - -if [ $stage -le 0 ]; then - local/generate_chime6_data.sh \ - --cmd "$train_cmd" \ - ${chime5_corpus} \ - ${chime6_corpus} -fi - -####################################################################### -# Prepare the dev and eval data with dereverberation (WPE) and -# beamforming. 
-####################################################################### -if [ $stage -le 1 ]; then - # Beamforming using reference arrays - # enhanced WAV directory - enhandir=enhan - dereverb_dir=${PWD}/wav/wpe/ - - for dset in dev eval; do - for mictype in u01 u02 u03 u04 u06; do - local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 20G" \ - ${audio_dir}/${dset} \ - ${dereverb_dir}/${dset} \ - ${mictype} - done - done - - for dset in dev eval; do - for mictype in u01 u02 u03 u04 u06; do - local/run_beamformit.sh --cmd "$train_cmd" \ - ${dereverb_dir}/${dset} \ - ${enhandir}/${dset}_${enhancement}_${mictype} \ - ${mictype} - done - done - - # Note that for the evaluation sets, we use the flag - # "--train false". This keeps the files segments, text, - # and utt2spk with .bak extensions, so that they can - # be used later for scoring if needed but are not used - # in the intermediate stages. - for dset in dev eval; do - local/prepare_data.sh --mictype ref --train false \ - "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \ - ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb - done - -fi - -if [ $stage -le 2 ]; then - # mfccdir should be some place with a largish disk where you - # want to store MFCC features. - mfccdir=mfcc - for x in ${test_sets}; do - steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \ - --mfcc-config conf/mfcc_hires.conf \ - data/$x exp/make_mfcc/$x $mfccdir - done -fi - -####################################################################### -# Perform SAD on the dev/eval data -####################################################################### -dir=exp/segmentation${affix} -sad_work_dir=exp/sad${affix}_${nnet_type}/ -sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a - -if [ $stage -le 3 ]; then - for datadir in ${test_sets}; do - test_set=data/${datadir} - if [ ! 
-f ${test_set}/wav.scp ]; then - echo "$0: Not performing SAD on ${test_set}" - exit 0 - fi - # Perform segmentation - local/segmentation/detect_speech_activity.sh --nj $nj --stage $sad_stage \ - $test_set $sad_nnet_dir mfcc $sad_work_dir \ - data/${datadir} || exit 1 - - test_dir=data/${datadir}_${nnet_type}_seg - mv data/${datadir}_seg ${test_dir}/ - cp data/${datadir}/{segments.bak,utt2spk.bak} ${test_dir}/ - # Generate RTTM file from segmentation performed by SAD. This can - # be used to evaluate the performance of the SAD as an intermediate - # step. - steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ - ${test_dir}/utt2spk ${test_dir}/segments ${test_dir}/rttm - - if [ $score_sad == "true" ]; then - echo "Scoring $datadir.." - # We first generate the reference RTTM from the backed up utt2spk and segments - # files. - ref_rttm=${test_dir}/ref_rttm - steps/segmentation/convert_utt2spk_and_segments_to_rttm.py ${test_dir}/utt2spk.bak \ - ${test_dir}/segments.bak ${test_dir}/ref_rttm - - # To score, we select just U06 segments from the hypothesis RTTM. - hyp_rttm=${test_dir}/rttm.U06 - grep 'U06' ${test_dir}/rttm > ${test_dir}/rttm.U06 - echo "Array U06 selected for scoring.." - - if $use_new_rttm_reference == "true"; then - echo "Use the new RTTM reference." 
- mode="$(cut -d'_' -f1 <<<"$datadir")" - ref_rttm=./chime6_rttm/${mode}_rttm - fi - - sed 's/_U0[1-6].ENH//g' $ref_rttm > $ref_rttm.scoring - sed 's/_U0[1-6].ENH//g' $hyp_rttm > $hyp_rttm.scoring - cat ./local/uem_file | grep 'U06' | sed 's/_U0[1-6]//g' > ./local/uem_file.tmp - md-eval.pl -1 -c 0.25 -u ./local/uem_file.tmp -r $ref_rttm.scoring -s $hyp_rttm.scoring |\ - awk 'or(/MISSED SPEECH/,/FALARM SPEECH/)' - fi - done -fi - -####################################################################### -# Perform diarization on the dev/eval data -####################################################################### -if [ $stage -le 4 ]; then - for datadir in ${test_sets}; do - if $use_new_rttm_reference == "true"; then - mode="$(cut -d'_' -f1 <<<"$datadir")" - ref_rttm=./chime6_rttm/${mode}_rttm - else - ref_rttm=data/${datadir}_${nnet_type}_seg/ref_rttm - fi - local/diarize.sh --nj $nj --cmd "$train_cmd" --stage $diarizer_stage \ - --ref-rttm $ref_rttm \ - exp/xvector_nnet_1a \ - data/${datadir}_${nnet_type}_seg \ - exp/${datadir}_${nnet_type}_seg_diarization - done -fi - -####################################################################### -# Decode diarized output using trained chain model -####################################################################### -if [ $stage -le 5 ]; then - for datadir in ${test_sets}; do - local/decode_diarized.sh --nj $nj --cmd "$decode_cmd" --stage $decode_diarize_stage \ - exp/${datadir}_${nnet_type}_seg_diarization data/$datadir data/lang \ - exp/chain_${train_set}_cleaned_rvb exp/nnet3_${train_set}_cleaned_rvb \ - data/${datadir}_diarized || exit 1 - done -fi - -####################################################################### -# Score decoded dev/eval sets -####################################################################### -if [ $stage -le 6 ]; then - # final scoring to get the challenge result - # please specify both dev and eval set directories so that the search parameters - # (insertion penalty and 
language model weight) will be tuned using the dev set - local/score_for_submit.sh --stage $score_stage \ - --dev_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_dev_beamformit_dereverb_diarized_2stage \ - --dev_datadir dev_beamformit_dereverb_diarized_hires \ - --eval_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_eval_beamformit_dereverb_diarized_2stage \ - --eval_datadir eval_beamformit_dereverb_diarized_hires -fi -exit 0; diff --git a/egs/chime6/s5b_track2/local/decode_diarized.sh b/egs/chime6/s5b_track2/local/decode_diarized.sh deleted file mode 100755 index f687b313893..00000000000 --- a/egs/chime6/s5b_track2/local/decode_diarized.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2019 Ashish Arora, Vimal Manohar -# Apache 2.0. -# This script takes an rttm file, and performs decoding on on a test directory. -# The output directory contains a text file which can be used for scoring. - - -stage=0 -nj=8 -cmd=queue.pl -echo "$0 $@" # Print the command line for logging -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; -if [ $# != 6 ]; then - echo "Usage: $0 " - echo "e.g.: $0 data/rttm data/dev data/lang_chain exp/chain_train_worn_simu_u400k_cleaned_rvb \ - exp/nnet3_train_worn_simu_u400k_cleaned_rvb data/dev_diarized" - echo "Options: " - echo " --nj # number of parallel jobs." - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - exit 1; -fi - -rttm_dir=$1 -data_in=$2 -lang_dir=$3 -asr_model_dir=$4 -ivector_extractor=$5 -out_dir=$6 - -for f in $rttm_dir/rttm $data_in/wav.scp $data_in/text.bak \ - $lang_dir/L.fst $asr_model_dir/tree_sp/graph/HCLG.fst \ - $asr_model_dir/tdnn1b_sp/final.mdl; do - [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1; -done - -if [ $stage -le 0 ]; then - echo "$0 copying data files in output directory" - cp $rttm_dir/rttm $rttm_dir/rttm_1 - sed -i 's/'.ENH'/''/g' $rttm_dir/rttm_1 - # removing participant introduction from the hypothesis rttm - # UEM file contains the scoring durations for each recording - local/truncate_rttm.py $rttm_dir/rttm_1 local/uem_file $rttm_dir/rttm_introduction_removed - mkdir -p ${out_dir}_hires - cp ${data_in}/{wav.scp,utt2spk} ${out_dir}_hires - utils/data/get_reco2dur.sh ${out_dir}_hires -fi - -if [ $stage -le 1 ]; then - echo "$0 creating segments file from rttm and utt2spk, reco2file_and_channel " - local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm_dir/rttm_introduction_removed \ - <(awk '{print $2".ENH "$2" "$3}' $rttm_dir/rttm_introduction_removed |sort -u) \ - ${out_dir}_hires/utt2spk ${out_dir}_hires/segments - - utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt - - awk '{print $1" "$1" 1"}' ${out_dir}_hires/wav.scp > ${out_dir}_hires/reco2file_and_channel - utils/fix_data_dir.sh ${out_dir}_hires || exit 1; -fi - -if [ $stage -le 2 ]; then - echo "$0 extracting mfcc freatures using segments file" - steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd queue.pl ${out_dir}_hires - steps/compute_cmvn_stats.sh ${out_dir}_hires - cp $data_in/text.bak ${out_dir}_hires/text -fi - -if [ $stage -le 3 ]; then - echo "$0 performing decoding on the extracted features" - local/nnet3/decode.sh --affix 2stage --acwt 1.0 --post-decode-acwt 10.0 \ - --frames-per-chunk 150 --nj $nj --ivector-dir $ivector_extractor \ - $out_dir $lang_dir $asr_model_dir/tree_sp/graph $asr_model_dir/tdnn1b_sp/ -fi - diff --git a/egs/chime6/s5b_track2/run.sh b/egs/chime6/s5b_track2/run.sh index 5e278e17da7..55e1bd3e1e1 100755 --- a/egs/chime6/s5b_track2/run.sh +++ b/egs/chime6/s5b_track2/run.sh @@ -60,7 +60,7 @@ sad_train_set=train_worn_u400k 
test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb" # This script also needs the phonetisaurus g2p, srilm, beamformit -#./local/check_tools.sh || exit 1; +./local/check_tools.sh || exit 1; ########################################################################### # We first generate the synchronized audio files across arrays and From db536bb63d46d797e627ba874ec9fb791569503b Mon Sep 17 00:00:00 2001 From: medennikov Date: Mon, 15 Jun 2020 11:05:01 +0300 Subject: [PATCH 04/10] Add spectral clustering --- egs/chime6/s5b_track2/RESULTS | 13 +- egs/chime6/s5b_track2/local/decode_ts-vad.sh | 13 +- egs/chime6/s5b_track2/local/diarize_sc.sh | 118 +++++++++++ .../spectral_clustering/calc_cossim_scores.py | 66 ++++++ .../local/spectral_clustering/scluster.sh | 111 ++++++++++ .../local/spectral_clustering/score_cossim.sh | 81 +++++++ .../local/spectral_clustering/spec_clust.py | 197 ++++++++++++++++++ egs/chime6/s5b_track2/run.sh | 4 +- 8 files changed, 593 insertions(+), 10 deletions(-) create mode 100755 egs/chime6/s5b_track2/local/diarize_sc.sh create mode 100644 egs/chime6/s5b_track2/local/spectral_clustering/calc_cossim_scores.py create mode 100755 egs/chime6/s5b_track2/local/spectral_clustering/scluster.sh create mode 100755 egs/chime6/s5b_track2/local/spectral_clustering/score_cossim.sh create mode 100644 egs/chime6/s5b_track2/local/spectral_clustering/spec_clust.py diff --git a/egs/chime6/s5b_track2/RESULTS b/egs/chime6/s5b_track2/RESULTS index 18d7f53e4a6..abb5f845b87 100644 --- a/egs/chime6/s5b_track2/RESULTS +++ b/egs/chime6/s5b_track2/RESULTS @@ -18,11 +18,16 @@ Dev (new RTTM) 63.42 70.83 Eval (old RTTM) 61.96 71.40 Eval (new RTTM) 68.20 72.54 +# Diarization (Spectral Clustering) + DER JER +Dev (new RTTM) 59.03 61.94 +Eval (new RTTM) 64.67 63.36 + # Diarization (3 iterations of TS-VAD) DER JER -Dev (new RTTM) 45.90 52.45 -Eval (new RTTM) 49.13 55.83 +Dev (new RTTM) 45.29 52.24 +Eval (new RTTM) 41.33 44.83 # ASR nnet3 tdnn+chain -Dev: %WER 77.34 [ 
45538 / 58881, 1099 ins, 24840 del, 19599 sub ] -Eval: %WER 71.90 [ 39639 / 55132, 1550 ins, 23748 del, 14341 sub ] +Dev: %WER 76.60 [ 45103 / 58881, 1239 ins, 24854 del, 19010 sub ] +Eval: %WER 66.80 [ 36827 / 55132, 1070 ins, 22100 del, 13657 sub ] diff --git a/egs/chime6/s5b_track2/local/decode_ts-vad.sh b/egs/chime6/s5b_track2/local/decode_ts-vad.sh index f410d72d701..606840f456b 100755 --- a/egs/chime6/s5b_track2/local/decode_ts-vad.sh +++ b/egs/chime6/s5b_track2/local/decode_ts-vad.sh @@ -44,6 +44,11 @@ test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb" # ts-vad ts_vad_dir=exp/ts-vad_b ivector_dir=exp/nnet3_b +ups=18 + +#spectral clustering +daffix= +use_sc=true . ./utils/parse_options.sh @@ -51,6 +56,7 @@ ivector_dir=exp/nnet3_b . ./path.sh . ./conf/sad.conf +$use_sc && daffix="_sc" # This script also needs the phonetisaurus g2p, srilm, beamformit #./local/check_tools.sh || exit 1 @@ -159,7 +165,7 @@ if [ $stage -le 3 ]; then hyp_rttm=${test_dir}/rttm.U06 grep 'U06' ${test_dir}/rttm > ${test_dir}/rttm.U06 echo "Array U06 selected for scoring.." - + if $use_new_rttm_reference == "true"; then echo "Use the new RTTM reference." 
mode="$(cut -d'_' -f1 <<<"$datadir")" @@ -186,7 +192,7 @@ if [ $stage -le 4 ]; then else ref_rttm=data/${datadir}_${nnet_type}_seg/ref_rttm fi - local/diarize.sh --nj $nj --cmd "$train_cmd" --stage $diarizer_stage \ + local/diarize${daffix}.sh --nj $nj --cmd "$train_cmd" --stage $diarizer_stage \ --ref-rttm $ref_rttm \ exp/xvector_nnet_1a \ data/${datadir}_${nnet_type}_seg \ @@ -226,6 +232,7 @@ if [ $stage -le 5 ]; then ivector_affix=it${it}-init it=$((it+1)) local/ts-vad/diarize_TS-VAD_it2.sh --cmd "$train_cmd" \ + --ups $ups \ --ref-rttm $ref_rttm \ --it $it \ --ivector-affix $ivector_affix \ @@ -233,7 +240,7 @@ if [ $stage -le 5 ]; then --audio_dir $audio_dir \ $ts_vad_dir $ivector_dir $initdir \ $ts_vad_dir/it${it}_${ivector_affix} || exit 1 - initdir=$ts_vad_dir/it${it}_${ivector_affix}/${mode}_20ch-AVG_hires_split10000_18ups + initdir=$ts_vad_dir/it${it}_${ivector_affix}/${mode}_20ch-AVG_hires_split10000_${ups}ups done if [ ! -f data/${datadir}_ts-vad-it${ts_vad_num_iters}-diarized_hires/feats.scp ]; then diff --git a/egs/chime6/s5b_track2/local/diarize_sc.sh b/egs/chime6/s5b_track2/local/diarize_sc.sh new file mode 100755 index 00000000000..554f88a3f05 --- /dev/null +++ b/egs/chime6/s5b_track2/local/diarize_sc.sh @@ -0,0 +1,118 @@ +#!/bin/bash +# Copyright 2019 David Snyder +# 2020 Maxim Korenevsky, Ivan Medennikov (STC-innovations Ltd) +# Apache 2.0. +# +# This script takes an input directory that has a segments file (and +# a feats.scp file), and performs spectral clustering based diarization on it. +# The output directory contains an RTTM file which can be used to resegment the input data. + +stage=0 +nj=10 +cmd="run.pl" +ref_rttm= + +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization" + echo "Options: " + echo " --nj # number of parallel jobs." 
+ echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --ref-rttm # if present, used to score output RTTM." + exit 1; +fi + +model_dir=$1 +data_in=$2 +out_dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/segments $model_dir/plda \ + $model_dir/final.raw $model_dir/extract.config; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 0 ]; then + echo "$0: keeping only data corresponding to array U06 " + echo "$0: we can skip this stage, to perform diarization on all arrays " + # to perform diarization and scoring on all arrays, please skip this step and + # pass all_array = true in local/multispeaker_score.sh + cp -r data/$name data/${name}.bak + mv data/$name/wav.scp data/$name/wav.scp.bak + grep 'U06' data/$name/wav.scp.bak > data/$name/wav.scp + utils/fix_data_dir.sh data/$name +fi +nj=2 # since we have reduced number of "speakers" now + +if [ $stage -le 1 ]; then + echo "$0: computing features for x-vector extractor" + utils/fix_data_dir.sh data/${name} + rm -rf data/${name}_cmn + local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \ + data/$name data/${name}_cmn exp/${name}_cmn + cp data/$name/segments exp/${name}_cmn/ + utils/fix_data_dir.sh data/${name}_cmn +fi + +if [ $stage -le 2 ]; then + echo "$0: extracting x-vectors for all segments" + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \ + --nj $nj --window 1.5 --period 0.75 --apply-cmn false \ + --min-segment 0.5 $model_dir \ + data/${name}_cmn $out_dir/xvectors_${name} +fi + +# Perform cosine similarity scoring +if [ $stage -le 3 ]; then + # Perform cosine similarity scoring on all pairs of segments for each recording. 
+ echo "$0: performing cosine similarity scoring between all pairs of x-vectors" + local/spectral_clustering/score_cossim.sh --cmd "$cmd" \ + --nj $nj $out_dir/xvectors_${name} \ + $out_dir/xvectors_${name}/cossim_scores +fi + + +if [ $stage -le 4 ]; then + echo "$0: performing spectral clustering using cosine similarity scores (we assume 4 speakers per recording)" + awk '{print $1, "4"}' data/$name/wav.scp > data/$name/reco2num_spk + local/spectral_clustering/scluster.sh --cmd "$cmd" --nj $nj \ + --reco2num-spk data/$name/reco2num_spk \ + --rttm-channel 1 \ + $out_dir/xvectors_${name}/cossim_scores $out_dir + echo "$0: wrote RTTM to output directory ${out_dir}" +fi + +hyp_rttm=${out_dir}/rttm + +# For scoring the diarization system, we use the same tool that was +# used in the DIHARD II challenge. This is available at: +# https://github.com/nryant/dscore +# Note that the scoring takes a single reference RTTM and a single +# hypothesis RTTM. +if [ $stage -le 5 ]; then + # If a reference RTTM file is not provided, we create one using the backed up + # segments and utt2spk files in the original data directory. + if [ -z "$ref_rttm" ]; then + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py data/$name/utt2spk.bak \ + data/$name/segments.bak data/$name/rttm + ref_rttm=data/$name/rttm + fi + echo "Diarization results for "${name} + if ! [ -d dscore ]; then + git clone https://github.com/nryant/dscore.git || exit 1; + cd dscore + python -m pip install --user -r requirements.txt + cd .. + fi + sed 's/_U0[1-6]\.ENH//g' $ref_rttm > $ref_rttm.scoring + sed 's/_U0[1-6]\.ENH//g' $hyp_rttm > $hyp_rttm.scoring + ref_rttm_path=$(readlink -f ${ref_rttm}.scoring) + hyp_rttm_path=$(readlink -f ${hyp_rttm}.scoring) + cat ./local/uem_file | grep 'U06' | sed 's/_U0[1-6]//g' > ./local/uem_file.scoring + cd dscore && python score.py -u ../local/uem_file.scoring -r $ref_rttm_path \ + -s $hyp_rttm_path 2>&1 | tee -a ../${out_dir}/DER && cd .. 
|| exit 1; +fi diff --git a/egs/chime6/s5b_track2/local/spectral_clustering/calc_cossim_scores.py b/egs/chime6/s5b_track2/local/spectral_clustering/calc_cossim_scores.py new file mode 100644 index 00000000000..e3decfd171b --- /dev/null +++ b/egs/chime6/s5b_track2/local/spectral_clustering/calc_cossim_scores.py @@ -0,0 +1,66 @@ +import argparse +import numpy as np +from scipy.spatial.distance import cosine, pdist, squareform +from kaldiio import ReadHelper, WriteHelper + + +def LoadReco2Utt(file): + if ':' in file: + file = file.split(':')[1] + IDs=dict() + with open(file,'r') as f: + for line in f: + ids = line.strip().split() + IDs[ids[0]] = ids[1:] + return IDs + +def ReadXvecs(rspec): + xvecs=dict() + with ReadHelper(rspec) as reader: + for utid, xvec in reader: + xvecs[utid] = xvec + reader.close() + return xvecs + +def Normalize(xvecs_in): + N = len(xvecs_in) + xvec_mean=np.zeros(xvecs_in[0].shape) + for i in range(N): + xvec_mean += xvecs_in[i] + xvec_mean /= N + xvecs = np.copy(xvecs_in) + for i in range(N): + xvecs[i] -= xvec_mean + xvecs[i] = xvecs[i] / np.linalg.norm(xvecs[i]) + return xvecs + +def CalcCosSim(vecs): + return 1 - squareform(pdist(np.asarray(vecs), 'cosine')) + +def WriteDistMatrices(D, wspec): + with WriteHelper(wspec) as writer: + for id in D: + writer(id, D[id]) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Usage: calc_cossim_scores.py \nComputes matrices of the cosine similarity scores between normalized x-vectors for each recording') + parser.add_argument('reco2utt', type=str, help='Kaldi-style rspecifier of recording to segments correspondence') + parser.add_argument('xvec_rspec', type=str, help='Kaldi-style rspecifier of segment xvectors to read') + parser.add_argument('simmat_wspec', type=str, help='Kaldi-style wspecifier of similarity matrices to write') + args = parser.parse_args() + + + print('Computing cosine similarity matrix between ivectors') + print('Parameters:') + print('Reco2Utt rspecifier: 
{}'.format(args.reco2utt)) + print('Xvectors rspecifier: {}'.format(args.xvec_rspec)) + print('Similarity matrices wspecifier: {}'.format(args.simmat_wspec)) + + IDs = LoadReco2Utt(args.reco2utt) + xvecs_all = ReadXvecs(args.xvec_rspec) + D = dict() + for reco_id in IDs: + xvecs = [ xvecs_all[id] for id in IDs[reco_id] ] + xvecs = Normalize(xvecs) # !!!! Normalize per recording (session) !!!! + D[reco_id] = CalcCosSim(xvecs) + WriteDistMatrices(D, args.simmat_wspec) diff --git a/egs/chime6/s5b_track2/local/spectral_clustering/scluster.sh b/egs/chime6/s5b_track2/local/spectral_clustering/scluster.sh new file mode 100755 index 00000000000..7966626202a --- /dev/null +++ b/egs/chime6/s5b_track2/local/spectral_clustering/scluster.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# Copyright 2016 David Snyder +# 2017-2018 Matthew Maciejewski +# Apache 2.0. + +# This script performs agglomerative clustering using scored +# pairs of subsegments and produces a rttm file with speaker +# labels derived from the clusters. + +# Begin configuration section. +cmd="run.pl" +stage=0 +nj=10 +cleanup=true +threshold=0.5 +max_spk_fraction=1.0 +first_pass_max_utterances=32767 +rttm_channel=0 +read_costs=false +reco2num_spk= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/ivectors_callhome exp/ivectors_callhome/results" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --stage # To control partial reruns" + echo " --threshold # Cluster stopping criterion. Clusters with scores greater" + echo " # than this value will be merged until all clusters" + echo " # exceed this value." 
+ echo " --max-spk-fraction # Clusters with total fraction of utterances greater than" + echo " # this value will not be merged. This is active only when" + echo " # reco2num-spk is supplied and" + echo " # 1.0 / num-spk <= max-spk-fraction <= 1.0." + echo " --first-pass-max-utterances # If the number of utterances is larger than first-pass-max-utterances," + echo " # then clustering is done in two passes. In the first pass, input points" + echo " # are divided into contiguous subsets of size first-pass-max-utterances" + echo " # and each subset is clustered separately. In the second pass, the first" + echo " # pass clusters are merged into the final set of clusters." + echo " --rttm-channel # The value passed into the RTTM channel field. Only affects" + echo " # the format of the RTTM file." + echo " --read-costs # If true, interpret input scores as costs, i.e. similarity" + echo " # is indicated by smaller values. If enabled, clusters will" + echo " # be merged until all cluster scores are less than the" + echo " # threshold value." + echo " --reco2num-spk # File containing mapping of recording ID" + echo " # to number of speakers. Used instead of threshold" + echo " # as stopping criterion if supplied." + echo " --cleanup # If true, remove temporary files" + exit 1; +fi + +srcdir=$1 +dir=$2 + +mkdir -p $dir/tmp + +for f in $srcdir/scores.scp $srcdir/spk2utt $srcdir/utt2spk $srcdir/segments ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +cp $srcdir/spk2utt $dir/tmp/ +cp $srcdir/utt2spk $dir/tmp/ +cp $srcdir/segments $dir/tmp/ +utils/fix_data_dir.sh $dir/tmp > /dev/null + +if [ ! -z $reco2num_spk ]; then + reco2num_spk="ark,t:$reco2num_spk" +fi + +sdata=$dir/tmp/split$nj; +utils/split_data.sh $dir/tmp $nj || exit 1; + +# Set various variables. 
+mkdir -p $dir/log + +feats="utils/filter_scp.pl $sdata/JOB/spk2utt $srcdir/scores.scp |" +if [ $stage -le 0 ]; then + echo "$0: clustering scores" + for j in `seq $nj`; do + utils/filter_scp.pl $sdata/$j/spk2utt $srcdir/scores.scp > $dir/scores.$j.scp + done + $cmd JOB=1:$nj $dir/log/spectral_cluster.JOB.log \ + python local/spectral_clustering/spec_clust.py \ + --reco2num_spk $reco2num_spk \ + scp:$dir/scores.JOB.scp ark,t:$sdata/JOB/spk2utt ark,t:$dir/labels.JOB || exit 1; +fi + +if [ $stage -le 1 ]; then + echo "$0: combining labels" + for j in $(seq $nj); do cat $dir/labels.$j; done > $dir/labels || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: computing RTTM" + diarization/make_rttm.py --rttm-channel $rttm_channel $srcdir/segments $dir/labels $dir/rttm || exit 1; +fi + +if $cleanup ; then + rm -r $dir/tmp || exit 1; +fi diff --git a/egs/chime6/s5b_track2/local/spectral_clustering/score_cossim.sh b/egs/chime6/s5b_track2/local/spectral_clustering/score_cossim.sh new file mode 100755 index 00000000000..b6ac5dc6549 --- /dev/null +++ b/egs/chime6/s5b_track2/local/spectral_clustering/score_cossim.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# Copyright 2016-2018 David Snyder +# 2017-2018 Matthew Maciejewski +# Apache 2.0. + +# This script is a modified version of diarization/score_plda.sh +# that replaces i-vectors with x-vectors. +# +# This script computes cosine similarity scores from pairs of normalized x-vectors extracted +# from segments of a recording. These scores are in the form of +# affinity matrices, one for each recording. Most likely, the x-vectors +# were computed using diarization/nnet3/xvector/extract_xvectors.sh. +# The affinity matrices are most likely going to be clustered using +# diarization/cluster.sh. + +# Begin configuration section. +cmd="run.pl" +stage=0 +target_energy=0.1 +nj=10 +cleanup=true +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/xvectors_callhome_heldout exp/xvectors_callhome_test exp/xvectors_callhome_test" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --stage # To control partial reruns" + echo " --cleanup # If true, remove temporary files" + exit 1; +fi + +xvecdir=$1 +dir=$2 + +mkdir -p $dir/tmp + +for f in $xvecdir/xvector.scp $xvecdir/spk2utt $xvecdir/utt2spk $xvecdir/segments; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done +cp $xvecdir/xvector.scp $dir/tmp/feats.scp +cp $xvecdir/spk2utt $dir/tmp/ +cp $xvecdir/utt2spk $dir/tmp/ +cp $xvecdir/segments $dir/tmp/ +cp $xvecdir/spk2utt $dir/ +cp $xvecdir/utt2spk $dir/ +cp $xvecdir/segments $dir/ + +utils/fix_data_dir.sh $dir/tmp > /dev/null + +sdata=$dir/tmp/split$nj; +utils/split_data.sh $dir/tmp $nj || exit 1; + +# Set various variables. 
+mkdir -p $dir/log + +feats="scp:$sdata/JOB/feats.scp" +if [ $stage -le 0 ]; then + echo "$0: scoring xvectors" + $cmd JOB=1:$nj $dir/log/cossim_scoring.JOB.log \ + python local/spectral_clustering/calc_cossim_scores.py \ + ark:$sdata/JOB/spk2utt "$feats" ark,scp:$dir/scores.JOB.ark,$dir/scores.JOB.scp || exit 1; +fi + +if [ $stage -le 1 ]; then + echo "$0: combining cosine similarity scores across jobs" + for j in $(seq $nj); do cat $dir/scores.$j.scp; done >$dir/scores.scp || exit 1; +fi + +if $cleanup ; then + rm -rf $dir/tmp || exit 1; +fi diff --git a/egs/chime6/s5b_track2/local/spectral_clustering/spec_clust.py b/egs/chime6/s5b_track2/local/spectral_clustering/spec_clust.py new file mode 100644 index 00000000000..e242a41eaef --- /dev/null +++ b/egs/chime6/s5b_track2/local/spectral_clustering/spec_clust.py @@ -0,0 +1,197 @@ +import argparse +import os +import numpy as np +from sklearn.cluster import k_means +from kaldiio import ReadHelper, WriteHelper +import scipy +from sklearn.cluster import SpectralClustering + +''' + Spectral Clustering based on binarization and automatic thresholding + Paper: T.Park, K.Han, M.Kumar, and S.Narayanan, “Auto-tuning spectral clustering for speaker diarization using normalized maximumeigengap”, IEEE Signal Processing Letters, vol. 27, pp. 
381–385,2019 +''' + + + +# Input-output routines + +def LoadAffinityMatrix(file): + Matrices=dict() + with ReadHelper(file) as reader: + for key, np_arr in reader: + Matrices[key] = np_arr + return Matrices + +def LoadReco2Utt(file): + if ':' in file: + file = file.split(':')[1] + IDs=dict() + with open(file,'r') as f: + for line in f: + ids = line.strip().split() + IDs[ids[0]] = ids[1:] + return IDs + + +def LoadReco2NumSpk(file): + if ':' in file: + file = file.split(':')[1] + NumSpk=dict() + with open(file,'r') as f: + for line in f: + ids = line.strip().split() + NumSpk[ids[0]] = int(ids[1]) + return NumSpk + +def SaveLabels(IDs, labels, file): + if ':' in file: + file = file.split(':')[1] + with open(file,'w') as f: + for id in IDs: + for i in range(len(IDs[id])): + f.write('{} {}\n'.format(IDs[id][i], labels[id][i]+1)) + +# NME low-level operations + +# Prepares binarized(0/1) affinity matrix with p_neighbors non-zero elements in each row +def get_kneighbors_conn(X_dist, p_neighbors): + X_dist_out = np.zeros_like(X_dist) + for i, line in enumerate(X_dist): + sorted_idx = np.argsort(line) + sorted_idx = sorted_idx[::-1] + indices = sorted_idx[:p_neighbors] + X_dist_out[indices, i] = 1 + return X_dist_out + +# Thresolds affinity matrix to leave p maximum non-zero elements in each row +def Threshold(A, p): + N = A.shape[0] + Ap = np.zeros((N,N)) + for i in range(N): + thr = sorted(A[i,:], reverse=True)[p] + Ap[i,A[i,:]>thr] = A[i,A[i,:]>thr] + return Ap + +# Computes Laplacian of a matrix +def Laplacian(A): + d = np.sum(A, axis=1)-np.diag(A) + D = np.diag(d) + return D - A + +# Calculates eigengaps (differences between adjacent eigenvalues sorted in descending order) +def Eigengap(S): + S = sorted(S) + return np.diff(S) + +# Computes parameters of normalized eigenmaps for automatic thresholding selection +def ComputeNMEParameters(A, p, max_num_clusters): + # p-Neighbour binarization + Ap = get_kneighbors_conn(A, p) + # Symmetrization + Ap = (Ap + 
np.transpose(Ap))/2 + # Laplacian matrix computation + Lp = Laplacian(Ap) + # EigenValue Decomposition + S, eig_vecs = scipy.linalg.eigh(Lp) + # Eigengap computation + e = Eigengap(S) + g = np.max(e[:max_num_clusters])/(np.max(S)+1e-10) + r = p/g + k = np.argmax(e[:max_num_clusters]) + return (e, g, k, r) + + +''' +Performs spectral clustering with Normalized Maximum Eigengap (NME) +Parameters: + A: affinity matrix (matrix of pairwise cosine similarities or PLDA scores between speaker embeddings) + num_clusters: number of clusters to generate (if None, determined automatically) + max_num_clusters: maximum allowed number of clusters to generate + pmax: maximum count for matrix binarization (should be at least 2) + pbest: best count for matrix binarization (if 0, determined automatically) +Returns: cluster assignments for every speaker embedding +''' +def NME_SpectralClustering(A, num_clusters = None, max_num_clusters = 10, pbest = 0, pmax = 20): + if pbest==0: + print('Selecting best number of neighbors for affinity matrix thresolding:') + rbest = None + kbest = None + for p in range(2, pmax+1): + e, g, k, r = ComputeNMEParameters(A, p, max_num_clusters) + print('p={}, r={}'.format(p,r)) + if rbest is None or rbest > r: + rbest = r + pbest = p + kbest = k + print('Best number of neighbors is {}'.format(pbest)) + return NME_SpectralClustering_sklearn(A, num_clusters if num_clusters is not None else (kbest+1), pbest) + if num_clusters is None: + print('Compute number of clusters to generate:') + e, g, r, k = ComputeNMEParameters(A, p) + print('Number of clusters to generate is {}'.format(k+1)) + return NME_SpectralClustering_sklearn(A, k+1, pbest) + return NME_SpectralClustering_sklearn(A, num_clusters, pbest) + +''' +Performs spectral clustering with Normalized Maximum Eigengap (NME) with fixed threshold and number of clusters +Parameters: + A: affinity matrix (matrix of pairwise cosine similarities or PLDA scores between speaker embeddings) + num_clusters: number of 
clusters to generate + pbest: best count for matrix binarization +Returns: cluster assignments for every speaker embedding +''' +def NME_SpectralClustering_sklearn(A, num_clusters, pbest): + Ap = Threshold(A, pbest) + Ap = (Ap + np.transpose(Ap)) / 2 + model = SpectralClustering(n_clusters = num_clusters, affinity='precomputed', random_state=0) + labels = model.fit_predict(Ap) + return labels + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Usage: spec_clust.py [options] \n' + + 'Performs spectral clustering of xvectors according to pairwise similarity scores\n' + + 'Auto-selects binarization threshold') + parser.add_argument('simmat_rspec', type=str, help='Kaldi-style rspecifier of similarity scores matrices to read') + parser.add_argument('reco2utt_rspec', type=str, help='Kaldi-style rspecifier of recording-to-utterances correspondence') + parser.add_argument('labels_wspec', type=str, help='Kaldi-style wspecifier to save xvector cluster labels') + parser.add_argument('--max_neighbors', type=int, default=20, help='Maximum number of neighbors to threshold similarity matrix') + parser.add_argument('--reco2num_spk', type=str, default='', help='Kaldi-style rspecifier of recording-to-numofspeakers correspondence') + parser.add_argument('--num_clusters', type=int, default=None, help='Number of clusters to generate. 
Ignored if --reco2num_spk is given') + args = parser.parse_args() + + assert args.max_neighbors > 1, 'Maximum number of neighpors should be at least 2, {} passed\n'.format(args.max_neighbors) + + print('Spectral clustering of xvector according to precomputed similarity scores matrix') + print('Parameters:') + print('Similarity matrix rspecifier: {}'.format(args.simmat_rspec)) + print('Reco2Utt rspecifier: {}'.format(args.reco2utt_rspec)) + print('Labels wspecifier: {}'.format(args.labels_wspec)) + print('Number of clusters to generate: {}'.format(args.num_clusters)) + print('Maximum number of nighbors to threshold similarity matrix: {}\n'.format(args.max_neighbors)) + print('Reco2NumSpk rspecifier: {}'.format(args.reco2num_spk)) + + print('Loading affinity matrices...', end='') + Matrices = LoadAffinityMatrix(args.simmat_rspec) + print('done') + print('Loading Reco2Utt correspondence...', end='') + IDs = LoadReco2Utt(args.reco2utt_rspec) + print('done') + + if args.reco2num_spk != '': + NumSpk = LoadReco2NumSpk(args.reco2num_spk) + + Labels = dict() + for id in IDs: + A = Matrices[id] + IDList = IDs[id] + + num_clusters = args.num_clusters if args.reco2num_spk == '' else NumSpk[id] + assert num_clusters is None or num_clusters > 0, 'Positive number of clusters expected for {}, {} found\n'.format(id, num_clusters) + + print('Start clustering for recording {}...'.format(id)) + Labels[id] = NME_SpectralClustering(A, num_clusters = num_clusters, pmax = args.max_neighbors) + print('Clustering done') + print( 'Saving labels...') + SaveLabels(IDs, Labels, args.labels_wspec) + print('done') diff --git a/egs/chime6/s5b_track2/run.sh b/egs/chime6/s5b_track2/run.sh index 55e1bd3e1e1..126ef574a05 100755 --- a/egs/chime6/s5b_track2/run.sh +++ b/egs/chime6/s5b_track2/run.sh @@ -37,8 +37,6 @@ ivector_dir=exp/nnet3_b . ./cmd.sh . ./path.sh -[ ! 
-f $ts_vad_archive ] && wget - if [ $decode_only == "true" ]; then stage=18 fi @@ -296,7 +294,7 @@ fi ########################################################################## # DECODING: In track 2, we are given raw utterances without segment # or speaker information, so we have to decode the whole pipeline, i.e., -# SAD -> Diarization (x-vectors clustering) -> TS-VAD Diarization -> ASR. +# SAD -> Diarization (x-vectors + Spectral Clustering) -> TS-VAD Diarization -> ASR. # This is done in the local/decode_ts-vad.sh script. ########################################################################## if [ $stage -le 18 ]; then From 7385efa720dbf1c34c80ebbdd9bef3d3506f44ee Mon Sep 17 00:00:00 2001 From: medennikov Date: Mon, 15 Jun 2020 13:26:46 +0300 Subject: [PATCH 05/10] Small fix --- egs/chime6/s5b_track2/local/decode_ts-vad.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/chime6/s5b_track2/local/decode_ts-vad.sh b/egs/chime6/s5b_track2/local/decode_ts-vad.sh index 606840f456b..a4d41ec6af2 100755 --- a/egs/chime6/s5b_track2/local/decode_ts-vad.sh +++ b/egs/chime6/s5b_track2/local/decode_ts-vad.sh @@ -58,7 +58,7 @@ use_sc=true $use_sc && daffix="_sc" # This script also needs the phonetisaurus g2p, srilm, beamformit -#./local/check_tools.sh || exit 1 +./local/check_tools.sh || exit 1 ########################################################################### # We first generate the synchronized audio files across arrays and From d84baed428ef548cd2c42f7f64a1f4c60f32e7ab Mon Sep 17 00:00:00 2001 From: medennikov Date: Mon, 15 Jun 2020 17:39:43 +0300 Subject: [PATCH 06/10] Spectral clustering speed-up --- .../s5b_track2/local/spectral_clustering/spec_clust.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/egs/chime6/s5b_track2/local/spectral_clustering/spec_clust.py b/egs/chime6/s5b_track2/local/spectral_clustering/spec_clust.py index e242a41eaef..74bed84e238 100644 --- 
a/egs/chime6/s5b_track2/local/spectral_clustering/spec_clust.py +++ b/egs/chime6/s5b_track2/local/spectral_clustering/spec_clust.py @@ -91,11 +91,17 @@ def ComputeNMEParameters(A, p, max_num_clusters): Ap = (Ap + np.transpose(Ap))/2 # Laplacian matrix computation Lp = Laplacian(Ap) + N = Lp.shape[0] # EigenValue Decomposition - S, eig_vecs = scipy.linalg.eigh(Lp) + # Get max_num_clusters+1 lowest eigenvalues sorted in ascending order + S, _ = scipy.sparse.linalg.eigsh(-Lp, k=max_num_clusters+1, which='LA') + S = -S[::-1] + # Get largest eigenvalue + Smax, _ = scipy.sparse.linalg.eigsh(Lp, k=1, which='LA') # Eigengap computation e = Eigengap(S) - g = np.max(e[:max_num_clusters])/(np.max(S)+1e-10) +# g = np.max(e[:max_num_clusters])/(np.max(S)+1e-10) + g = np.max(e[:max_num_clusters])/(Smax[0]+1e-10) r = p/g k = np.argmax(e[:max_num_clusters]) return (e, g, k, r) From ba62a52215b1ddfaee6cd73f8028865a0c629985 Mon Sep 17 00:00:00 2001 From: medennikov Date: Tue, 16 Jun 2020 00:42:45 +0300 Subject: [PATCH 07/10] Move general spectral clustering scripts to the diarization directory --- .../v1/diarization}/calc_cossim_scores.py | 5 +++++ .../v1/diarization}/scluster.sh | 5 +++-- .../v1/diarization}/score_cossim.sh | 5 +++-- .../v1/diarization}/spec_clust.py | 9 ++++++--- egs/chime6/s5b_track2/local/diarize_sc.sh | 4 ++-- 5 files changed, 19 insertions(+), 9 deletions(-) rename egs/{chime6/s5b_track2/local/spectral_clustering => callhome_diarization/v1/diarization}/calc_cossim_scores.py (96%) rename egs/{chime6/s5b_track2/local/spectral_clustering => callhome_diarization/v1/diarization}/scluster.sh (96%) rename egs/{chime6/s5b_track2/local/spectral_clustering => callhome_diarization/v1/diarization}/score_cossim.sh (94%) rename egs/{chime6/s5b_track2/local/spectral_clustering => callhome_diarization/v1/diarization}/spec_clust.py (96%) diff --git a/egs/chime6/s5b_track2/local/spectral_clustering/calc_cossim_scores.py 
b/egs/callhome_diarization/v1/diarization/calc_cossim_scores.py similarity index 96% rename from egs/chime6/s5b_track2/local/spectral_clustering/calc_cossim_scores.py rename to egs/callhome_diarization/v1/diarization/calc_cossim_scores.py index e3decfd171b..98ae683f606 100644 --- a/egs/chime6/s5b_track2/local/spectral_clustering/calc_cossim_scores.py +++ b/egs/callhome_diarization/v1/diarization/calc_cossim_scores.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Maxim Korenevsky (STC-innovations Ltd) +# Apache 2.0. + import argparse import numpy as np from scipy.spatial.distance import cosine, pdist, squareform diff --git a/egs/chime6/s5b_track2/local/spectral_clustering/scluster.sh b/egs/callhome_diarization/v1/diarization/scluster.sh similarity index 96% rename from egs/chime6/s5b_track2/local/spectral_clustering/scluster.sh rename to egs/callhome_diarization/v1/diarization/scluster.sh index 7966626202a..f9aba5df37b 100755 --- a/egs/chime6/s5b_track2/local/spectral_clustering/scluster.sh +++ b/egs/callhome_diarization/v1/diarization/scluster.sh @@ -2,9 +2,10 @@ # Copyright 2016 David Snyder # 2017-2018 Matthew Maciejewski +# 2020 Maxim Korenevsky (STC-innovations Ltd) # Apache 2.0. -# This script performs agglomerative clustering using scored +# This script performs spectral clustering using scored # pairs of subsegments and produces a rttm file with speaker # labels derived from the clusters. 
@@ -91,7 +92,7 @@ if [ $stage -le 0 ]; then utils/filter_scp.pl $sdata/$j/spk2utt $srcdir/scores.scp > $dir/scores.$j.scp done $cmd JOB=1:$nj $dir/log/spectral_cluster.JOB.log \ - python local/spectral_clustering/spec_clust.py \ + python diarization/spec_clust.py \ --reco2num_spk $reco2num_spk \ scp:$dir/scores.JOB.scp ark,t:$sdata/JOB/spk2utt ark,t:$dir/labels.JOB || exit 1; fi diff --git a/egs/chime6/s5b_track2/local/spectral_clustering/score_cossim.sh b/egs/callhome_diarization/v1/diarization/score_cossim.sh similarity index 94% rename from egs/chime6/s5b_track2/local/spectral_clustering/score_cossim.sh rename to egs/callhome_diarization/v1/diarization/score_cossim.sh index b6ac5dc6549..6b37d6a5d1a 100755 --- a/egs/chime6/s5b_track2/local/spectral_clustering/score_cossim.sh +++ b/egs/callhome_diarization/v1/diarization/score_cossim.sh @@ -1,6 +1,7 @@ #!/bin/bash # Copyright 2016-2018 David Snyder # 2017-2018 Matthew Maciejewski +# 2020 Maxim Korenevsky (STC-innovations Ltd) # Apache 2.0. # This script is a modified version of diarization/score_plda.sh @@ -11,7 +12,7 @@ # affinity matrices, one for each recording. Most likely, the x-vectors # were computed using diarization/nnet3/xvector/extract_xvectors.sh. # The affinity matrices are most likely going to be clustered using -# diarization/cluster.sh. +# diarization/scluster.sh. # Begin configuration section. 
cmd="run.pl" @@ -67,7 +68,7 @@ feats="scp:$sdata/JOB/feats.scp" if [ $stage -le 0 ]; then echo "$0: scoring xvectors" $cmd JOB=1:$nj $dir/log/cossim_scoring.JOB.log \ - python local/spectral_clustering/calc_cossim_scores.py \ + python diarization/calc_cossim_scores.py \ ark:$sdata/JOB/spk2utt "$feats" ark,scp:$dir/scores.JOB.ark,$dir/scores.JOB.scp || exit 1; fi diff --git a/egs/chime6/s5b_track2/local/spectral_clustering/spec_clust.py b/egs/callhome_diarization/v1/diarization/spec_clust.py similarity index 96% rename from egs/chime6/s5b_track2/local/spectral_clustering/spec_clust.py rename to egs/callhome_diarization/v1/diarization/spec_clust.py index 74bed84e238..2bf771c6132 100644 --- a/egs/chime6/s5b_track2/local/spectral_clustering/spec_clust.py +++ b/egs/callhome_diarization/v1/diarization/spec_clust.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Maxim Korenevsky (STC-innovations Ltd) +# Apache 2.0. + import argparse import os import numpy as np @@ -8,11 +13,9 @@ ''' Spectral Clustering based on binarization and automatic thresholding - Paper: T.Park, K.Han, M.Kumar, and S.Narayanan, “Auto-tuning spectral clustering for speaker diarization using normalized maximumeigengap”, IEEE Signal Processing Letters, vol. 27, pp. 381–385,2019 + Paper: T.Park, K.Han, M.Kumar, and S.Narayanan, Auto-tuning spectral clustering for speaker diarization using normalized maximum eigengap, IEEE Signal Processing Letters, vol. 27, pp. 381-385, 2019 ''' - - # Input-output routines def LoadAffinityMatrix(file): diff --git a/egs/chime6/s5b_track2/local/diarize_sc.sh b/egs/chime6/s5b_track2/local/diarize_sc.sh index 554f88a3f05..1c594f04e0a 100755 --- a/egs/chime6/s5b_track2/local/diarize_sc.sh +++ b/egs/chime6/s5b_track2/local/diarize_sc.sh @@ -70,7 +70,7 @@ fi if [ $stage -le 3 ]; then # Perform cosine similarity scoring on all pairs of segments for each recording. 
echo "$0: performing cosine similarity scoring between all pairs of x-vectors" - local/spectral_clustering/score_cossim.sh --cmd "$cmd" \ + diarization/score_cossim.sh --cmd "$cmd" \ --nj $nj $out_dir/xvectors_${name} \ $out_dir/xvectors_${name}/cossim_scores fi @@ -79,7 +79,7 @@ fi if [ $stage -le 4 ]; then echo "$0: performing spectral clustering using cosine similarity scores (we assume 4 speakers per recording)" awk '{print $1, "4"}' data/$name/wav.scp > data/$name/reco2num_spk - local/spectral_clustering/scluster.sh --cmd "$cmd" --nj $nj \ + diarization/scluster.sh --cmd "$cmd" --nj $nj \ --reco2num-spk data/$name/reco2num_spk \ --rttm-channel 1 \ $out_dir/xvectors_${name}/cossim_scores $out_dir From 83db1859194e36df015e1146ed991e155cd6b859 Mon Sep 17 00:00:00 2001 From: medennikov Date: Tue, 16 Jun 2020 10:47:46 +0300 Subject: [PATCH 08/10] Remove redundant arguments in diarization/scluster.sh --- .../v1/diarization/scluster.sh | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/egs/callhome_diarization/v1/diarization/scluster.sh b/egs/callhome_diarization/v1/diarization/scluster.sh index f9aba5df37b..33a373a282d 100755 --- a/egs/callhome_diarization/v1/diarization/scluster.sh +++ b/egs/callhome_diarization/v1/diarization/scluster.sh @@ -14,11 +14,7 @@ cmd="run.pl" stage=0 nj=10 cleanup=true -threshold=0.5 -max_spk_fraction=1.0 -first_pass_max_utterances=32767 rttm_channel=0 -read_costs=false reco2num_spk= # End configuration section. @@ -36,24 +32,8 @@ if [ $# != 2 ]; then echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --nj # Number of jobs (also see num-processes and num-threads)" echo " --stage # To control partial reruns" - echo " --threshold # Cluster stopping criterion. Clusters with scores greater" - echo " # than this value will be merged until all clusters" - echo " # exceed this value." 
- echo " --max-spk-fraction # Clusters with total fraction of utterances greater than" - echo " # this value will not be merged. This is active only when" - echo " # reco2num-spk is supplied and" - echo " # 1.0 / num-spk <= max-spk-fraction <= 1.0." - echo " --first-pass-max-utterances # If the number of utterances is larger than first-pass-max-utterances," - echo " # then clustering is done in two passes. In the first pass, input points" - echo " # are divided into contiguous subsets of size first-pass-max-utterances" - echo " # and each subset is clustered separately. In the second pass, the first" - echo " # pass clusters are merged into the final set of clusters." echo " --rttm-channel # The value passed into the RTTM channel field. Only affects" echo " # the format of the RTTM file." - echo " --read-costs # If true, interpret input scores as costs, i.e. similarity" - echo " # is indicated by smaller values. If enabled, clusters will" - echo " # be merged until all cluster scores are less than the" - echo " # threshold value." echo " --reco2num-spk # File containing mapping of recording ID" echo " # to number of speakers. Used instead of threshold" echo " # as stopping criterion if supplied." 
From cf4308a78290aceaebb9380f5ac468ac6a4e992d Mon Sep 17 00:00:00 2001 From: medennikov Date: Tue, 16 Jun 2020 16:01:14 +0300 Subject: [PATCH 09/10] Add GSS on TS-VAD diarized segments --- .../v1/diarization/scluster.sh | 8 +- egs/chime6/s5b_track2/RESULTS | 10 +- egs/chime6/s5b_track2/local/decode_ts-vad.sh | 92 +++++++++++++++++-- egs/chime6/s5b_track2/local/diarize_sc.sh | 2 +- .../s5b_track2/local/get_cache_chime6.sh | 53 +++++++++++ .../s5b_track2/local/prepare_gss_data.sh | 37 ++++++++ egs/chime6/s5b_track2/local/run_gss.sh | 1 + .../local/ts-vad/compute_ts-vad_weights.sh | 2 +- .../local/ts-vad/convert_prob_to_rttm.py | 2 +- .../local/ts-vad/diarize_TS-VAD_it1.sh | 2 +- .../local/ts-vad/diarize_TS-VAD_it2.sh | 1 - .../local/ts-vad/modify_ups_utt2spk.pl | 6 +- .../local/ts-vad/split_feats_seg.pl | 14 +-- .../s5b_track2/local/ts-vad/vad_prob_mod.py | 2 +- egs/chime6/s5b_track2/run.sh | 3 +- 15 files changed, 204 insertions(+), 31 deletions(-) create mode 100755 egs/chime6/s5b_track2/local/get_cache_chime6.sh create mode 100755 egs/chime6/s5b_track2/local/prepare_gss_data.sh create mode 120000 egs/chime6/s5b_track2/local/run_gss.sh diff --git a/egs/callhome_diarization/v1/diarization/scluster.sh b/egs/callhome_diarization/v1/diarization/scluster.sh index 33a373a282d..51300315149 100755 --- a/egs/callhome_diarization/v1/diarization/scluster.sh +++ b/egs/callhome_diarization/v1/diarization/scluster.sh @@ -44,6 +44,11 @@ fi srcdir=$1 dir=$2 +reco2num_spk_opts= +if [ ! 
$reco2num_spk == "" ]; then + reco2num_spk_opts="--reco2num-spk $reco2num_spk" +fi + mkdir -p $dir/tmp for f in $srcdir/scores.scp $srcdir/spk2utt $srcdir/utt2spk $srcdir/segments ; do @@ -72,8 +77,7 @@ if [ $stage -le 0 ]; then utils/filter_scp.pl $sdata/$j/spk2utt $srcdir/scores.scp > $dir/scores.$j.scp done $cmd JOB=1:$nj $dir/log/spectral_cluster.JOB.log \ - python diarization/spec_clust.py \ - --reco2num_spk $reco2num_spk \ + python diarization/spec_clust.py $reco2num_spk_opts \ scp:$dir/scores.JOB.scp ark,t:$sdata/JOB/spk2utt ark,t:$dir/labels.JOB || exit 1; fi diff --git a/egs/chime6/s5b_track2/RESULTS b/egs/chime6/s5b_track2/RESULTS index abb5f845b87..944a19b6f13 100644 --- a/egs/chime6/s5b_track2/RESULTS +++ b/egs/chime6/s5b_track2/RESULTS @@ -11,14 +11,14 @@ Dev (new RTTM) 1.9 0.7 2.6 Eval (old RTTM) 4.1 1.8 5.9 Eval (new RTTM) 4.3 1.5 5.8 -# Diarization +# Diarization (x-vectors + AHC) DER JER Dev (old RTTM) 61.56 69.75 Dev (new RTTM) 63.42 70.83 Eval (old RTTM) 61.96 71.40 Eval (new RTTM) 68.20 72.54 -# Diarization (Spectral Clustering) +# Diarization (x-vectors + Spectral Clustering) DER JER Dev (new RTTM) 59.03 61.94 Eval (new RTTM) 64.67 63.36 @@ -28,6 +28,6 @@ Eval (new RTTM) 64.67 63.36 Dev (new RTTM) 45.29 52.24 Eval (new RTTM) 41.33 44.83 -# ASR nnet3 tdnn+chain -Dev: %WER 76.60 [ 45103 / 58881, 1239 ins, 24854 del, 19010 sub -Eval: %WER 66.80 [ 36827 / 55132, 1070 ins, 22100 del, 13657 sub ] +# ASR nnet3 tdnn+chain (GSS on TS-VAD segments) +Dev: %WER 67.50 [ 39745 / 58881, 1524 ins, 21250 del, 16971 sub ] +Eval: %WER 60.23 [ 33204 / 55132, 869 ins, 21582 del, 10753 sub ] diff --git a/egs/chime6/s5b_track2/local/decode_ts-vad.sh b/egs/chime6/s5b_track2/local/decode_ts-vad.sh index a4d41ec6af2..f8fb0eb1fc6 100755 --- a/egs/chime6/s5b_track2/local/decode_ts-vad.sh +++ b/egs/chime6/s5b_track2/local/decode_ts-vad.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash # # This script decodes raw utterances through the entire pipeline: -# Feature extraction -> SAD 
-> Diarization -> TS-VAD diarization -> ASR +# Feature extraction -> SAD -> Diarization -> TS-VAD diarization -> GSS enhancement -> ASR # # Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) # 2019 Desh Raj, David Snyder, Ashish Arora, Zhaoheng Ni -# 2020 Ivan Medennikov +# 2020 Ivan Medennikov, Tatyana Prisyach, Maxim Korenevsky (STC-innovations Ltd) # Apache 2.0 # Begin configuration section. @@ -50,6 +50,18 @@ ups=18 daffix= use_sc=true +# gss +final_gss=true +gss_nj=40 +bss_iterations=5 +context_samples=160000 + +#number of microphones to perform GSS: outer_array_mics (CH1 and CH4 of each Kinect) or True (all microphones) +multiarray=outer_array_mics + +#GSS activities: hard (standard binary activities) or soft (TS-VAD derived activities, not implemented yet) +gss_type=hard + . ./utils/parse_options.sh . ./cmd.sh @@ -57,6 +69,8 @@ use_sc=true . ./conf/sad.conf $use_sc && daffix="_sc" +pref_enhan=_${multiarray}_${context_samples}_${bss_iterations}it + # This script also needs the phonetisaurus g2p, srilm, beamformit ./local/check_tools.sh || exit 1 @@ -252,29 +266,89 @@ if [ $stage -le 5 ]; then fi ####################################################################### -# Decode diarized output using trained chain model +# GSS on top of TS-VAD diarized segments ####################################################################### if [ $stage -le 6 ]; then + if $final_gss; then + if [ ! -d pb_chime5/ ]; then + local/install_pb_chime5.sh + fi + echo "$0: enhance data..." 
+ # Guided Source Separation (GSS) from Paderborn University + # http://spandh.dcs.shef.ac.uk/chime_workshop/papers/CHiME_2018_paper_boeddecker.pdf + # @Article{PB2018CHiME5, + # author = {Boeddeker, Christoph and Heitkaemper, Jens and Schmalenstroeer, Joerg and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold}, + # title = {{Front-End Processing for the CHiME-5 Dinner Party Scenario}}, + # year = {2018}, + # booktitle = {CHiME5 Workshop}, + # } + + miniconda_dir=$HOME/miniconda3/ + export PATH=$miniconda_dir/bin:$PATH + export CHIME6_DIR=$chime6_corpus + + for dset in ${test_sets}; do + datadir=data/${dset}_ts-vad-it${ts_vad_num_iters}-diarized + dset_type=`echo $dset | awk -F "_" '{print $1;}'` + [ ! -f ${datadir}_hires/chime6.json ] && local/get_cache_chime6.sh ${datadir}_hires/segments $dset_type $audio_dir/$dset_type ${datadir}_hires/chime6.json + [ ! -d pb_chime5/cache ] && mkdir pb_chime5/cache + cp -f ${datadir}_hires/chime6.json pb_chime5/cache/chime6.json + + enhanced_dir=data/gss_${gss_type}${pref_enhan}_ts-vad-it${ts_vad_num_iters}-diarized + if [ ! -f ${enhanced_dir}/.${dset_type}.done ]; then + local/run_gss.sh \ + --cmd "$train_cmd --max-jobs-run $gss_nj" --nj 160 \ + --bss_iterations $bss_iterations \ + --context_samples $context_samples \ + --multiarray $multiarray \ + ${dset_type} \ + ${enhanced_dir} \ + ${enhanced_dir} || exit 1 + touch ${enhanced_dir}/.${dset_type}.done + fi + + if [ ! 
-f data/${datadir}_gss_${gss_type}${pref_enhan}_hires/feats.scp ]; then + local/prepare_gss_data.sh ${enhanced_dir}/audio/${dset_type} ${datadir}_hires ${datadir}_gss_${gss_type}${pref_enhan}_hires + fi + done + fi +fi + +####################################################################### +# Decode diarized output using trained chain model +####################################################################### +if [ $stage -le 7 ]; then for datadir in ${test_sets}; do + dset=data/${datadir}_ts-vad-it${ts_vad_num_iters}-diarized + if $final_gss; then + dset=${dset}_gss_${gss_type}${pref_enhan} + fi echo "$0 performing decoding on the extracted features" asr_model_dir=exp/chain_${train_set}_cleaned_rvb local/nnet3/decode.sh --affix 2stage --acwt 1.0 --post-decode-acwt 10.0 \ --frames-per-chunk 150 --nj $nj --ivector-dir exp/nnet3_${train_set}_cleaned_rvb \ - data/${datadir}_ts-vad-it${ts_vad_num_iters}-diarized data/lang $asr_model_dir/tree_sp/graph $asr_model_dir/tdnn1b_sp/ || exit 1 + $dset data/lang $asr_model_dir/tree_sp/graph $asr_model_dir/tdnn1b_sp/ || exit 1 done fi ####################################################################### # Score decoded dev/eval sets ####################################################################### -if [ $stage -le 7 ]; then +if [ $stage -le 8 ]; then # final scoring to get the challenge result # please specify both dev and eval set directories so that the search parameters # (insertion penalty and language model weight) will be tuned using the dev set + dev_dir=dev_beamformit_dereverb_ts-vad-it${ts_vad_num_iters}-diarized + eval_dir=eval_beamformit_dereverb_ts-vad-it${ts_vad_num_iters}-diarized + if $final_gss; then + dev_dir=${dev_dir}_gss_${gss_type}${pref_enhan} + eval_dir=${eval_dir}_gss_${gss_type}${pref_enhan} + fi local/score_for_submit.sh --stage $score_stage \ - --dev_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_dev_beamformit_dereverb_ts-vad-it${ts_vad_num_iters}-diarized_2stage \ - 
--dev_datadir dev_beamformit_dereverb_ts-vad-it${ts_vad_num_iters}-diarized_hires \ - --eval_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_eval_beamformit_dereverb_ts-vad-it${ts_vad_num_iters}-diarized_2stage \ - --eval_datadir eval_beamformit_dereverb_ts-vad-it${ts_vad_num_iters}-diarized_hires + --dev_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_${dev_dir}_2stage \ + --dev_datadir ${dev_dir}_hires \ + --eval_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_${eval_dir}_2stage \ + --eval_datadir ${eval_dir}_hires fi + exit 0; diff --git a/egs/chime6/s5b_track2/local/diarize_sc.sh b/egs/chime6/s5b_track2/local/diarize_sc.sh index 1c594f04e0a..247bead2d35 100755 --- a/egs/chime6/s5b_track2/local/diarize_sc.sh +++ b/egs/chime6/s5b_track2/local/diarize_sc.sh @@ -1,6 +1,6 @@ #!/bin/bash # Copyright 2019 David Snyder -# 2020 Maxim Korenevsky, Ivan Medennikov (STC-innovastions Ltd) +# 2020 Maxim Korenevsky, Ivan Medennikov (STC-innovations Ltd) # Apache 2.0. # # This script takes an input directory that has a segments file (and diff --git a/egs/chime6/s5b_track2/local/get_cache_chime6.sh b/egs/chime6/s5b_track2/local/get_cache_chime6.sh new file mode 100755 index 00000000000..f4bfd6de3f0 --- /dev/null +++ b/egs/chime6/s5b_track2/local/get_cache_chime6.sh @@ -0,0 +1,53 @@ +#!/bin/bash -u + +# Copyright 2020 Prisyach Tatyana (STC-innovations Ltd) +# Apache 2.0. 
+ +segments=$1 +dset=$2 +dir_chime6=$3 +json=$4 + +if [ $dset == "dev" ]; then + awk -F "_|-| " -v dir_chime6=$dir_chime6 -v S02="false" -v S09="false" ' + BEGIN { printf"%s","{\n \"alias\": {\n \"dev\": [\n \"S02\",\n \"S09\"\n ]\n },\n \"datasets\": {\n"; } + {if (($1 == "S02" && S02 == "true") || ($1 == "S09" && S09 == "true")) { printf"%s",",\n"; } + else if ($1 == "S09" && S09 == "false") { printf"%s","\n },\n"; } + if ($1 == "S02") { + if (S02 == "false") { printf"%s"," \"S02\": {\n"; S02="true"; } + if ($3 == "1") { spk="P05"; } + else if ($3 == "2") { spk="P06"; } + else if ($3 == "3") { spk="P07"; } + else { spk="P08"; } + printf"%s"," \"" $1"_"$2"-"$3"-"$4"-"$5 "\": {\n \"audio_path\": {\n \"observation\": {\n \"U01\": [\n \""dir_chime6"/S02_U01.CH1.wav\",\n \""dir_chime6"/S02_U01.CH2.wav\",\n \""dir_chime6"/S02_U01.CH3.wav\",\n \""dir_chime6"/S02_U01.CH4.wav\"\n ],\n \"U02\": [\n \""dir_chime6"/S02_U02.CH1.wav\",\n \""dir_chime6"/S02_U02.CH2.wav\",\n \""dir_chime6"/S02_U02.CH3.wav\",\n \""dir_chime6"/S02_U02.CH4.wav\"\n ],\n \"U03\": [\n \""dir_chime6"/S02_U03.CH1.wav\",\n \""dir_chime6"/S02_U03.CH2.wav\",\n \""dir_chime6"/S02_U03.CH3.wav\",\n \""dir_chime6"/S02_U03.CH4.wav\"\n ],\n \"U04\": [\n \""dir_chime6"/S02_U04.CH1.wav\",\n \""dir_chime6"/S02_U04.CH2.wav\",\n \""dir_chime6"/S02_U04.CH3.wav\",\n \""dir_chime6"/S02_U04.CH4.wav\"\n ],\n \"U05\": [\n \""dir_chime6"/S02_U05.CH1.wav\",\n \""dir_chime6"/S02_U05.CH2.wav\",\n \""dir_chime6"/S02_U05.CH3.wav\",\n \""dir_chime6"/S02_U05.CH4.wav\"\n ],\n \"U06\": [\n \""dir_chime6"/S02_U06.CH1.wav\",\n \""dir_chime6"/S02_U06.CH2.wav\",\n \""dir_chime6"/S02_U06.CH3.wav\",\n \""dir_chime6"/S02_U06.CH4.wav\"\n ]\n },\n \"worn\": {\n \"P05\":\n \""dir_chime6"/S02_P05.wav\",\n \"P06\":\n \""dir_chime6"/S02_P06.wav\",\n \"P07\":\n \""dir_chime6"/S02_P07.wav\",\n \"P08\":\n \""dir_chime6"/S02_P08.wav\"\n }\n },\n \"end\": "int($9*16000)",\n \"gender\": \""unk"\",\n \"location\": \""unk"\",\n \"notes\": [],\n 
\"num_samples\": "int($9*16000-$8*16000)",\n \"reference_array\": \""unk"\",\n \"session_id\": \"S02\",\n \"speaker_id\": \""spk"\",\n \"start\": "int($8*16000)",\n \"transcription\": \"\"\n }"; + } else if ($1 == "S09") { + if (S09 == "false") { printf"%s"," \"S09\": {\n"; S09="true"; } + if ($3 == "1") { spk="P25"; } + else if ($3 == "2") { spk="P26"; } + else if ($3 == "3") { spk="P27"; } + else { spk="P28"; } + printf"%s"," \"" $1"_"$2"-"$3"-"$4"-"$5 "\": {\n \"audio_path\": {\n \"observation\": {\n \"U01\": [\n \""dir_chime6"/S09_U01.CH1.wav\",\n \""dir_chime6"/S09_U01.CH2.wav\",\n \""dir_chime6"/S09_U01.CH3.wav\",\n \""dir_chime6"/S09_U01.CH4.wav\"\n ],\n \"U02\": [\n \""dir_chime6"/S09_U02.CH1.wav\",\n \""dir_chime6"/S09_U02.CH2.wav\",\n \""dir_chime6"/S09_U02.CH3.wav\",\n \""dir_chime6"/S09_U02.CH4.wav\"\n ],\n \"U03\": [\n \""dir_chime6"/S09_U03.CH1.wav\",\n \""dir_chime6"/S09_U03.CH2.wav\",\n \""dir_chime6"/S09_U03.CH3.wav\",\n \""dir_chime6"/S09_U03.CH4.wav\"\n ],\n \"U04\": [\n \""dir_chime6"/S09_U04.CH1.wav\",\n \""dir_chime6"/S09_U04.CH2.wav\",\n \""dir_chime6"/S09_U04.CH3.wav\",\n \""dir_chime6"/S09_U04.CH4.wav\"\n ],\n \"U06\": [\n \""dir_chime6"/S09_U06.CH1.wav\",\n \""dir_chime6"/S09_U06.CH2.wav\",\n \""dir_chime6"/S09_U06.CH3.wav\",\n \""dir_chime6"/S09_U06.CH4.wav\"\n ]\n },\n \"worn\": {\n \"P25\":\n \""dir_chime6"/S09_P25.wav\",\n \"P26\":\n \""dir_chime6"/S09_P26.wav\",\n \"P27\":\n \""dir_chime6"/S09_P27.wav\",\n \"P28\":\n \""dir_chime6"/S09_P28.wav\"\n }\n },\n \"end\": "int($9*16000)",\n \"gender\": \""unk"\",\n \"location\": \""unk"\",\n \"notes\": [],\n \"num_samples\": "int($9*16000-$8*16000)",\n \"reference_array\": \""unk"\",\n \"session_id\": \"S09\",\n \"speaker_id\": \""spk"\",\n \"start\": "int($8*16000)",\n \"transcription\": \"\"\n }";} + } + END { printf"%s","\n }\n }\n}\n"; }' $segments > $json +elif [ $dset == "eval" ]; then + awk -F "_|-| " -v dir_chime6=$dir_chime6 -v S01="false" -v S21="false" ' + BEGIN { printf"%s","{\n 
\"alias\": {\n \"eval\": [\n \"S01\",\n \"S21\"\n ]\n },\n \"datasets\": {\n"; } + {if (($1 == "S01" && S01 == "true") || ($1 == "S21" && S21 == "true")) { printf"%s",",\n"; } + else if ($1 == "S21" && S21 == "false") { printf"%s","\n },\n"; } + if ($1 == "S01"){ + if (S01 == "false") { printf"%s"," \"S01\": {\n"; S01="true"; } + if ($3 == "1") { spk="P01"; } + else if ($3 == "2") { spk="P02"; } + else if ($3 == "3") { spk="P03"; } + else { spk="P04"; } + printf"%s"," \"" $1"_"$2"-"$3"-"$4"-"$5 "\": {\n \"audio_path\": {\n \"observation\": {\n \"U01\": [\n \""dir_chime6"/S01_U01.CH1.wav\",\n \""dir_chime6"/S01_U01.CH2.wav\",\n \""dir_chime6"/S01_U01.CH3.wav\",\n \""dir_chime6"/S01_U01.CH4.wav\"\n ],\n \"U02\": [\n \""dir_chime6"/S01_U02.CH1.wav\",\n \""dir_chime6"/S01_U02.CH2.wav\",\n \""dir_chime6"/S01_U02.CH3.wav\",\n \""dir_chime6"/S01_U02.CH4.wav\"\n ],\n \"U04\": [\n \""dir_chime6"/S01_U04.CH1.wav\",\n \""dir_chime6"/S01_U04.CH2.wav\",\n \""dir_chime6"/S01_U04.CH3.wav\",\n \""dir_chime6"/S01_U04.CH4.wav\"\n ],\n \"U05\": [\n \""dir_chime6"/S01_U05.CH1.wav\",\n \""dir_chime6"/S01_U05.CH2.wav\",\n \""dir_chime6"/S01_U05.CH3.wav\",\n \""dir_chime6"/S01_U05.CH4.wav\"\n ],\n \"U06\": [\n \""dir_chime6"/S01_U06.CH1.wav\",\n \""dir_chime6"/S01_U06.CH2.wav\",\n \""dir_chime6"/S01_U06.CH3.wav\",\n \""dir_chime6"/S01_U06.CH4.wav\"\n ]\n }\n },\n \"end\": "int($9*16000)",\n \"gender\": \""unk"\",\n \"location\": \""unk"\",\n \"notes\": [],\n \"num_samples\": "int($9*16000-$8*16000)",\n \"reference_array\": \""unk"\",\n \"session_id\": \"S01\",\n \"speaker_id\": \""spk"\",\n \"start\": "int($8*16000)",\n \"transcription\": \"\"\n }"; + } else if ($1 == "S21") { + if (S21 == "false") { printf"%s"," \"S21\": {\n"; S21="true"; } + if ($3 == "1") { spk="P45"; } + else if ($3 == "2") { spk="P46"; } + else if ($3 == "3") { spk="P47"; } + else { spk="P48"; } + printf"%s"," \"" $1"_"$2"-"$3"-"$4"-"$5 "\": {\n \"audio_path\": {\n \"observation\": {\n \"U01\": [\n 
\""dir_chime6"/S21_U01.CH1.wav\",\n \""dir_chime6"/S21_U01.CH2.wav\",\n \""dir_chime6"/S21_U01.CH3.wav\",\n \""dir_chime6"/S21_U01.CH4.wav\"\n ],\n \"U02\": [\n \""dir_chime6"/S21_U02.CH1.wav\",\n \""dir_chime6"/S21_U02.CH2.wav\",\n \""dir_chime6"/S21_U02.CH3.wav\",\n \""dir_chime6"/S21_U02.CH4.wav\"\n ],\n \"U03\": [\n \""dir_chime6"/S21_U03.CH1.wav\",\n \""dir_chime6"/S21_U03.CH2.wav\",\n \""dir_chime6"/S21_U03.CH3.wav\",\n \""dir_chime6"/S21_U03.CH4.wav\"\n ],\n \"U04\": [\n \""dir_chime6"/S21_U04.CH1.wav\",\n \""dir_chime6"/S21_U04.CH2.wav\",\n \""dir_chime6"/S21_U04.CH3.wav\",\n \""dir_chime6"/S21_U04.CH4.wav\"\n ],\n \"U05\": [\n \""dir_chime6"/S21_U05.CH1.wav\",\n \""dir_chime6"/S21_U05.CH2.wav\",\n \""dir_chime6"/S21_U05.CH3.wav\",\n \""dir_chime6"/S21_U05.CH4.wav\"\n ],\n \"U06\": [\n \""dir_chime6"/S21_U06.CH1.wav\",\n \""dir_chime6"/S21_U06.CH2.wav\",\n \""dir_chime6"/S21_U06.CH3.wav\",\n \""dir_chime6"/S21_U06.CH4.wav\"\n ]\n }\n },\n \"end\": "int($9*16000)",\n \"gender\": \""unk"\",\n \"location\": \""unk"\",\n \"notes\": [],\n \"num_samples\": "int($9*16000-$8*16000)",\n \"reference_array\": \""unk"\",\n \"session_id\": \"S21\",\n \"speaker_id\": \""spk"\",\n \"start\": "int($8*16000)",\n \"transcription\": \"\"\n }";} + } + END { printf"%s","\n }\n }\n}\n"; }' $segments > $json +fi diff --git a/egs/chime6/s5b_track2/local/prepare_gss_data.sh b/egs/chime6/s5b_track2/local/prepare_gss_data.sh new file mode 100755 index 00000000000..78f8af91dd8 --- /dev/null +++ b/egs/chime6/s5b_track2/local/prepare_gss_data.sh @@ -0,0 +1,37 @@ +#!/bin/bash -u +# Copyright 2020 Prisyach Tatyana (STC-innovations Ltd) + +. ./utils/parse_options.sh +. ./path.sh +. 
./cmd.sh + +echo >&2 "$0" "$@" +if [ $# -ne 3 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 " + echo -e >&2 "eg:\n $0 enhanced_dir/audio/dev data/dev_gss" + exit 1 +fi + +nj=8 + +gss_dir=$1 +src_dir=$2 +dir=$3 + +wav_list=$(find -L $gss_dir -name "*.wav" -printf "%f") +if [ ! -d $dir ]; then + mkdir ${dir} +fi +echo $wav_list | awk -F ".wav" -v gss_dir=$gss_dir '{for (i=1; i<=NF; i++) {print $i" "gss_dir"/"$i".wav";}}' > $dir/wav_temp.scp +sort -u $dir/wav_temp.scp > $dir/wav.scp +rm -f $dir/wav_temp.scp +cp $src_dir/utt2spk $dir/utt2spk + +utils/fix_data_dir.sh $dir + +echo "$0 extracting mfcc freatures using segments file" +steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$decode_cmd" ${dir} +steps/compute_cmvn_stats.sh ${dir} +cp $src_dir/text ${dir}/text diff --git a/egs/chime6/s5b_track2/local/run_gss.sh b/egs/chime6/s5b_track2/local/run_gss.sh new file mode 120000 index 00000000000..1711fb3f821 --- /dev/null +++ b/egs/chime6/s5b_track2/local/run_gss.sh @@ -0,0 +1 @@ +../../s5_track1/local/run_gss.sh \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh b/egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh index 425b90e92ee..9aeb40d83df 100755 --- a/egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh +++ b/egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh @@ -4,7 +4,7 @@ # 2015 Vijayaditya Peddinti # 2016 Vimal Manohar # 2017 Pegah Ghahremani -# 2020 Ivan Medennikov +# 2020 Ivan Medennikov (STC-innovations Ltd) # Apache 2.0 # Computes training alignments using nnet3 DNN, with output to lattices. 
diff --git a/egs/chime6/s5b_track2/local/ts-vad/convert_prob_to_rttm.py b/egs/chime6/s5b_track2/local/ts-vad/convert_prob_to_rttm.py index e95f36c9734..0934f8d2a81 100644 --- a/egs/chime6/s5b_track2/local/ts-vad/convert_prob_to_rttm.py +++ b/egs/chime6/s5b_track2/local/ts-vad/convert_prob_to_rttm.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright 2020 Yuri Khokhlov, Ivan Medennikov +# Copyright 2020 Yuri Khokhlov, Ivan Medennikov (STC-innovations Ltd) # Apache 2.0. """This script converts TS-VAD output probabilities to a NIST RTTM file. diff --git a/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it1.sh b/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it1.sh index fa80e9fb03b..277a1ebaa4d 100755 --- a/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it1.sh +++ b/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it1.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020 Ivan Medennikov +# Copyright 2020 Ivan Medennikov (STC-innovations Ltd) # Apache 2.0. # diff --git a/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it2.sh b/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it2.sh index 6d99428181a..15f328c206e 100755 --- a/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it2.sh +++ b/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it2.sh @@ -1,5 +1,4 @@ #!/bin/bash -#!/bin/bash # Copyright 2020 Ivan Medennikov # Apache 2.0. diff --git a/egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl b/egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl index 9deb0dbacf4..5c220332c9b 100755 --- a/egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl +++ b/egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl @@ -1,9 +1,11 @@ #!/usr/bin/perl +# Copyright 2020 Ivan Medennikov (STC-innovations Ltd) +# Apache 2.0. 
($filein,$ups,$fileout)=@ARGV; -open(fidin, "<$filein") or die "cant open $filein : $!"; -open(fidout, ">$fileout") or die "cant open $fileout : $!"; +open(fidin, "<$filein") or die "can't open $filein : $!"; +open(fidout, ">$fileout") or die "can't open $fileout : $!"; %utt2spk={}; %spk2utt={}; while ($line=) diff --git a/egs/chime6/s5b_track2/local/ts-vad/split_feats_seg.pl b/egs/chime6/s5b_track2/local/ts-vad/split_feats_seg.pl index 9f1ce0342a8..cdcf472bfc1 100755 --- a/egs/chime6/s5b_track2/local/ts-vad/split_feats_seg.pl +++ b/egs/chime6/s5b_track2/local/ts-vad/split_feats_seg.pl @@ -1,9 +1,11 @@ #!/usr/bin/perl +# Copyright 2020 Ivan Medennikov (STC-innovations Ltd) +# Apache 2.0. ($filein,$utt2spk,$utt2dur,$chunk,$fileout,$fileout2,$fileout3)=@ARGV; %utt2dur={}; -open(fidin, "<$utt2dur") or die "cant open $utt2dur : $!"; +open(fidin, "<$utt2dur") or die "can't open $utt2dur : $!"; while ($line=) { $line=~s/\s+$//; @@ -13,7 +15,7 @@ close(fidin); %utt2spk={}; -open(fidin, "<$utt2spk") or die "cant open $utt2spk : $!"; +open(fidin, "<$utt2spk") or die "can't open $utt2spk : $!"; while ($line=) { $line=~s/\s+$//; @@ -23,10 +25,10 @@ close(fidin); -open(fidin, "<$filein") or die "cant open $filein : $!"; -open(fidout, ">$fileout") or die "cant open $fileout : $!"; -open(fidout2, ">$fileout2") or die "cant open $fileout2 : $!"; -open(fidout3, ">$fileout3") or die "cant open $fileout3 : $!"; +open(fidin, "<$filein") or die "can't open $filein : $!"; +open(fidout, ">$fileout") or die "can't open $fileout : $!"; +open(fidout2, ">$fileout2") or die "can't open $fileout2 : $!"; +open(fidout3, ">$fileout3") or die "can't open $fileout3 : $!"; while ($line=) { diff --git a/egs/chime6/s5b_track2/local/ts-vad/vad_prob_mod.py b/egs/chime6/s5b_track2/local/ts-vad/vad_prob_mod.py index 38fe94a383e..a6b88833e6f 100644 --- a/egs/chime6/s5b_track2/local/ts-vad/vad_prob_mod.py +++ b/egs/chime6/s5b_track2/local/ts-vad/vad_prob_mod.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# 
Copyright 2020 Ivan Medennikov +# Copyright 2020 Ivan Medennikov (STC-innovations Ltd) # Apache 2.0. """This script modifies TS-VAD output probabilities applying diff --git a/egs/chime6/s5b_track2/run.sh b/egs/chime6/s5b_track2/run.sh index 126ef574a05..4bbd9c59d58 100755 --- a/egs/chime6/s5b_track2/run.sh +++ b/egs/chime6/s5b_track2/run.sh @@ -294,7 +294,8 @@ fi ########################################################################## # DECODING: In track 2, we are given raw utterances without segment # or speaker information, so we have to decode the whole pipeline, i.e., -# SAD -> Diarization (x-vectors + Spectral Clustering) -> TS-VAD Diarization -> ASR. +# SAD -> Diarization (x-vectors + Spectral Clustering) -> TS-VAD Diarization -> +# GSS -> ASR. # This is done in the local/decode_ts-vad.sh script. ########################################################################## if [ $stage -le 18 ]; then From fe4740e9234217ba869c6c2a9dd67259cab6e615 Mon Sep 17 00:00:00 2001 From: medennikov Date: Thu, 25 Jun 2020 14:55:23 +0300 Subject: [PATCH 10/10] add TS-VAD training script and ts-vad_1a model trained using this script; update RESULTS --- egs/chime6/s5b_track2/RESULTS | 28 +- egs/chime6/s5b_track2/local/decode_ts-vad.sh | 11 +- .../s5b_track2/local/get_cache_chime6.py | 87 ++++ egs/chime6/s5b_track2/local/train_ts-vad.sh | 422 ++++++++++++++++++ .../s5b_track2/local/ts-vad/apply_map.pl | 98 ++++ .../local/ts-vad/compute_ts-vad_weights.sh | 2 +- .../local/ts-vad/conv_ali_to_vad_012.py | 63 +++ .../local/ts-vad/conv_vad_to_dense_targets.py | 336 ++++++++++++++ .../local/ts-vad/extract_ivectors.sh | 296 ++++++++++++ .../local/ts-vad/make_json_align.py | 83 ++++ .../local/ts-vad/make_negative_utt2spk.pl | 155 +++++++ .../s5b_track2/local/ts-vad/make_utt2uniq.pl | 51 +++ .../local/ts-vad/modify_ups_utt2spk.pl | 3 + .../local/ts-vad/prepare_json_weights.pl | 57 +++ .../local/ts-vad/shuffle_4spk_scp_utt2spk.pl | 151 +++++++ egs/chime6/s5b_track2/run.sh | 35 +- 
src/featbin/Makefile | 2 +- src/featbin/multiply-vectors.cc | 173 +++++++ src/featbin/paste-vectors.cc | 138 ++++++ 19 files changed, 2163 insertions(+), 28 deletions(-) create mode 100755 egs/chime6/s5b_track2/local/get_cache_chime6.py create mode 100755 egs/chime6/s5b_track2/local/train_ts-vad.sh create mode 100755 egs/chime6/s5b_track2/local/ts-vad/apply_map.pl create mode 100644 egs/chime6/s5b_track2/local/ts-vad/conv_ali_to_vad_012.py create mode 100644 egs/chime6/s5b_track2/local/ts-vad/conv_vad_to_dense_targets.py create mode 100755 egs/chime6/s5b_track2/local/ts-vad/extract_ivectors.sh create mode 100755 egs/chime6/s5b_track2/local/ts-vad/make_json_align.py create mode 100755 egs/chime6/s5b_track2/local/ts-vad/make_negative_utt2spk.pl create mode 100755 egs/chime6/s5b_track2/local/ts-vad/make_utt2uniq.pl create mode 100755 egs/chime6/s5b_track2/local/ts-vad/prepare_json_weights.pl create mode 100755 egs/chime6/s5b_track2/local/ts-vad/shuffle_4spk_scp_utt2spk.pl create mode 100644 src/featbin/multiply-vectors.cc create mode 100644 src/featbin/paste-vectors.cc diff --git a/egs/chime6/s5b_track2/RESULTS b/egs/chime6/s5b_track2/RESULTS index 944a19b6f13..da0078e7122 100644 --- a/egs/chime6/s5b_track2/RESULTS +++ b/egs/chime6/s5b_track2/RESULTS @@ -6,10 +6,10 @@ # Speech Activity Detection (SAD) Missed speech False alarm Total error -Dev (old RTTM) 2.5 0.8 3.3 -Dev (new RTTM) 1.9 0.7 2.6 -Eval (old RTTM) 4.1 1.8 5.9 -Eval (new RTTM) 4.3 1.5 5.8 +Dev (old RTTM) 2.5 0.8 3.3 +Dev (new RTTM) 1.9 0.7 2.6 +Eval (old RTTM) 4.1 1.8 5.9 +Eval (new RTTM) 4.3 1.5 5.8 # Diarization (x-vectors + AHC) DER JER @@ -18,16 +18,20 @@ Dev (new RTTM) 63.42 70.83 Eval (old RTTM) 61.96 71.40 Eval (new RTTM) 68.20 72.54 -# Diarization (x-vectors + Spectral Clustering) +# Diarization (x-vectors + Spectral Clustering), new RTTM DER JER -Dev (new RTTM) 59.03 61.94 -Eval (new RTTM) 64.67 63.36 +Dev 59.03 61.94 +Eval 64.67 63.36 -# Diarization (3 iterations of TS-VAD) +# Diarization (3 
iterations of TS-VAD), new RTTM DER JER -Dev (new RTTM) 45.29 52.24 -Eval (new RTTM) 41.33 44.83 +Dev it1 48.82 55.52 +Dev it2 46.16 51.90 +Dev it3 44.88 50.48 +Eval it1 46.08 51.23 +Eval it2 42.90 47.24 +Eval it3 42.08 46.48 # ASR nnet3 tdnn+chain (GSS on TS-VAD segments) -Dev: %WER 67.50 [ 39745 / 58881, 1524 ins, 21250 del, 16971 sub ] -Eval: %WER 60.23 [ 33204 / 55132, 869 ins, 21582 del, 10753 sub ] +Dev: %WER 66.33 [ 39055 / 58881, 2641 ins, 20923 del, 15491 sub ] +Eval: %WER 60.03 [ 33098 / 55132, 1298 ins, 19428 del, 12372 sub ] diff --git a/egs/chime6/s5b_track2/local/decode_ts-vad.sh b/egs/chime6/s5b_track2/local/decode_ts-vad.sh index f8fb0eb1fc6..5ed51e8962f 100755 --- a/egs/chime6/s5b_track2/local/decode_ts-vad.sh +++ b/egs/chime6/s5b_track2/local/decode_ts-vad.sh @@ -237,6 +237,7 @@ if [ $stage -le 5 ]; then local/ts-vad/diarize_TS-VAD_it1.sh --cmd "$train_cmd" \ --ref-rttm $ref_rttm \ --ivector-affix $ivector_affix \ + --thr 0.4 \ $ts_vad_dir $ivector_dir ${datadir}_diarized \ $ts_vad_dir/it${it}_${ivector_affix} || exit 1 @@ -245,6 +246,9 @@ if [ $stage -le 5 ]; then while [ $it -lt $ts_vad_num_iters ]; do ivector_affix=it${it}-init it=$((it+1)) + mt=0.5 + t=0.5 + [ $it == "2" ] && mt=0 && t=0.5 local/ts-vad/diarize_TS-VAD_it2.sh --cmd "$train_cmd" \ --ups $ups \ --ref-rttm $ref_rttm \ @@ -252,6 +256,9 @@ if [ $stage -le 5 ]; then --ivector-affix $ivector_affix \ --channels "CH1 CH2 CH3 CH4" \ --audio_dir $audio_dir \ + --mt $mt \ + --t $t \ + --thr 0.4 \ $ts_vad_dir $ivector_dir $initdir \ $ts_vad_dir/it${it}_${ivector_affix} || exit 1 initdir=$ts_vad_dir/it${it}_${ivector_affix}/${mode}_20ch-AVG_hires_split10000_${ups}ups @@ -290,14 +297,14 @@ if [ $stage -le 6 ]; then for dset in ${test_sets}; do datadir=data/${dset}_ts-vad-it${ts_vad_num_iters}-diarized dset_type=`echo $dset | awk -F "_" '{print $1;}'` - [ ! 
-f ${datadir}_hires/chime6.json ] && local/get_cache_chime6.sh ${datadir}_hires/segments $dset_type $audio_dir/$dset_type ${datadir}_hires/chime6.json + [ ! -f ${datadir}_hires/chime6.json ] && python3 local/get_cache_chime6.py ${datadir}_hires/segments $dset_type $audio_dir/$dset_type ${datadir}_hires/chime6.json [ ! -d pb_chime5/cache ] && mkdir pb_chime5/cache cp -f ${datadir}_hires/chime6.json pb_chime5/cache/chime6.json enhanced_dir=data/gss_${gss_type}${pref_enhan}_ts-vad-it${ts_vad_num_iters}-diarized if [ ! -f ${enhanced_dir}/.${dset_type}.done ]; then local/run_gss.sh \ - --cmd "$train_cmd --max-jobs-run $gss_nj" --nj 160 \ + --cmd "$train_cmd --max-jobs-run $gss_nj" --nj 512 \ --bss_iterations $bss_iterations \ --context_samples $context_samples \ --multiarray $multiarray \ diff --git a/egs/chime6/s5b_track2/local/get_cache_chime6.py b/egs/chime6/s5b_track2/local/get_cache_chime6.py new file mode 100755 index 00000000000..5b822fb6fb5 --- /dev/null +++ b/egs/chime6/s5b_track2/local/get_cache_chime6.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Prisyach Tatyana (STC-innovations Ltd) +# Apache 2.0. + +import json +import argparse +import sys +import re + +def get_args(): + parser = argparse.ArgumentParser( + """This script creates chime6.json from ts-vad segments for dev and eval in the format required by pb_chime5 + e.g. 
{} segments \\ + dev \\ + CHiME6/audio/dev \\ + pb_chime5/cache/chime6.json""".format(sys.argv[0])) + + parser.add_argument("segments", help="""ts-vad segments""") + parser.add_argument("dset_type", help="""dataset name (dev or eval)""") + parser.add_argument("dir_chime6", help="""chime6 data directory to dev or eval""") + parser.add_argument("json", help="""path to chime6.json""") + args = parser.parse_args() + return args + +def create_main_fields(dset_type): + if dset_type == "dev": + to_json = { "alias": {"dev":["S02","S09"]}, "datasets": {"S02": {}, "S09": {}}} + elif dset_type == "eval": + to_json = { "alias": {"eval":["S01","S21"]}, "datasets": {"S01": {}, "S21": {}}} + return to_json + +def create_utt_field(dset_type, utt_name, ses, spk, time_start, time_end, fd, dir_chime6): + if dset_type == "dev": + if ses == "S02": + dic_spk = ["P05", "P06", "P07", "P08"] + kinects = ["U01", "U02", "U03", "U04", "U05", "U06"] + elif ses == "S09": + dic_spk = ["P25", "P26", "P27", "P28"] + kinects = ["U01", "U02", "U03", "U04", "U06"] + elif dset_type == "eval": + if ses == "S01": + dic_spk = ["P01", "P02", "P03", "P04"] + kinects = ["U01", "U02", "U04", "U05", "U06"] + elif ses == "S21": + dic_spk = ["P45", "P46", "P47", "P48"] + kinects = ["U01", "U02", "U03", "U04", "U05", "U06"] + + start = int(time_start * fd) + end = int(time_end * fd) + to_json = {"audio_path": {"observation": " "}, "end": end, "gender": " ", "location": " ", "notes": [], "num_samples": " ", "reference_array": " ", "session_id": " ", "speaker_id": " ", "start": start, "transcription": " "} + to_json["num_samples"] = end - start + to_json["session_id"] = ses + to_json["speaker_id"] = dic_spk[spk-1] + + channels = 4 + to_json["audio_path"]["observation"] = {} + for kinect in kinects: + kinect_ch = [] + for ch in range(channels): + kinect_ch.append(dir_chime6 + "/" + ses + "_" + kinect + "." 
+ "CH" + str(ch+1) + ".wav") + to_json["audio_path"]["observation"][kinect] = kinect_ch + return to_json + +def main(): + args = get_args() + + fd = 16000 + + print("dset_type=", args.dset_type) + json_chime6 = create_main_fields(args.dset_type) + + utt_list = open(args.segments).readlines() + + f_json = open(args.json, 'w') + + for utt in utt_list: + utt_name, wav, time_start, time_end = utt.split(None) + ses, kinect, spk, start, end = re.split('_|-', utt_name) + utt_field = create_utt_field(args.dset_type, utt_name, ses, int(spk), float(time_start), float(time_end), fd, args.dir_chime6) + json_chime6["datasets"][ses][utt_name] = utt_field + + f_json.write(json.dumps(json_chime6, indent=4)) + +if __name__ == '__main__': + main() + diff --git a/egs/chime6/s5b_track2/local/train_ts-vad.sh b/egs/chime6/s5b_track2/local/train_ts-vad.sh new file mode 100755 index 00000000000..d16dc790412 --- /dev/null +++ b/egs/chime6/s5b_track2/local/train_ts-vad.sh @@ -0,0 +1,422 @@ +#!/bin/bash +# Copyright 2020 Ivan Medennikov (STC-innovations Ltd) +# Apache 2.0 + +# This script trains TS-VAD model using the same training data +# as in the baseline acoustic model. + +. ./path.sh +. ./cmd.sh + +# Begin configuration section. 
+stage=0 +train_stage=-10 +srand=0 + +# Training options +num_epochs=2 +lrate=0003 +l2=0.002 +l2o=0.001 +common_egs_dir= +remove_egs=true + +lang=data/lang +silphonelist=1:2:3:4:5:21:22:23:24:25 +spnphonelist= + +sa=60 #number of seconds to sub-split speakers +basedata=train_worn_simu_u400k_cleaned_sp +srcdata=${basedata}_${sa}s +data=${srcdata}_hires +lats=${PWD}/exp/tri3_cleaned_ali_${basedata} +nnet3_affix=_train_worn_simu_u400k_cleaned_rvb +affix=1a + +tardir=$lats/VAD_targets +targets=$tardir/dense-4H/dense_targets.scp +ivector_dir=${PWD}/exp/nnet3${nnet3_affix} +nj_ivec=128 +nj_paste=48 +dir=exp/ts-vad_$affix + +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +json_ali=${PWD}/data/json_ali +sess_list="S03 S04 S05 S06 S07 S08 S12 S13 S16 S17 S18 S19 S20 S22 S23 S24" +sess_num=16 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +mdl=$lats/final.mdl +[ ! -f $mdl ] && echo "$0: expected model file $mdl to exist!" && exit 1; +ivdir=$ivector_dir/ivectors-offline_${data} +iv4dir=$ivector_dir/ivectors-offline-4spk_${data} + +if [ $stage -le 0 ]; then + if [ ! -f data/${srcdata}_hires/.done ]; then + echo "Splitting speakers in ${basedata} into ${sa}-second subspeakers" + utils/data/modify_speaker_info.sh --seconds-per-spk-max $sa data/${basedata}_hires data/${srcdata}_hires + touch data/${srcdata}_hires/.done + fi +fi + +if [ $stage -le 1 ]; then + outdir=$tardir + nj=$(cat $lats/num_jobs) || exit 1; + if [ -f $lats/ali.1.gz ]; then + if [ ! 
-f $outdir/.done ]; then + echo "Preparing per-utterance 1-speaker VAD targets from alignment" + $train_cmd JOB=1:$nj $outdir/log/ali_to_phones.JOB.log \ + gunzip -c $lats/ali.JOB.gz \| \ + ali-to-phones --frame-shift=0.01 --per-frame=true ${mdl} ark:- ark,t:$outdir/ali_phones.JOB.ark || exit 1; + $train_cmd JOB=1:$nj $outdir/log/conv_ali_to_vad.JOB.log \ + python3 local/ts-vad/conv_ali_to_vad_012.py "$silphonelist" "$spnphonelist" $outdir/ali_phones.JOB.ark $outdir/ali_vad_targets.JOB.ark || exit 1 + cat $outdir/ali_vad_targets.*.ark | sort > $outdir/ali_vad_targets.ark + if [ ! -f $outdir/targets.ark ]; then + vali_dst="ark,scp:$outdir/targets.ark,$outdir/targets.scp" + copy-int-vector "ark:$outdir/ali_vad_targets.ark" "$vali_dst" || exit 1 + fi + touch $outdir/.done + fi + fi +fi + +if [ $stage -le 2 ]; then + if [ ! -f $json_ali/.done ]; then + echo "Converting JSON to per-session VAD alignment (overlapped speech is considered as silence, to exclude these regions from i-vectors estimation)" + mkdir -p $json_ali + for json in `find $json_dir/ -name "*.json"`; do + sess=$(basename $json | sed s:.json::) + echo $sess + $train_cmd $json_ali/${sess}.log \ + python local/ts-vad/make_json_align.py $json ark,t,scp:$json_ali/$sess.ark,$json_ali/${sess}.scp || exit 1; + $train_cmd $json_ali/${sess}_sp0.9.log \ + python local/ts-vad/make_json_align.py --frame_shift 0.009 $json ark,t,scp:$json_ali/${sess}_sp0.9.ark,$json_ali/${sess}_sp0.9.scp || exit 1; + sed -i s:\ :_sp0.9\ : $json_ali/${sess}_sp0.9.scp + $train_cmd $json_ali/${sess}_sp1.1.log \ + python local/ts-vad/make_json_align.py --frame_shift 0.011 $json ark,t,scp:$json_ali/${sess}_sp1.1.ark,$json_ali/${sess}_sp1.1.scp || exit 1; + sed -i s:\ :_sp1.1\ : $json_ali/${sess}_sp1.1.scp + done + cat $json_ali/*.scp > $json_ali/all_sess.scp + touch $json_ali/.done + fi +fi + +if [ $stage -le 3 ]; then + ivdata=${srcdata}_hires + outdir=$ivdir + if [ ! 
-f $outdir/.lats-weights.done ]; then + echo 'Preparing weights for i-vectors extraction from ali/lats' + silence_weight=0.00001 + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + acwt=0.1 + if [ ! -f $lats/final.mdl ]; then + echo "$0: expected $lats/final.mdl to exist." + exit 1; + fi + if [ -f $lats/ali.1.gz ]; then + nj_orig=$(cat $lats/num_jobs) || exit 1; + rm $outdir/weights.*.gz 2>/dev/null + $train_cmd JOB=1:$nj_orig $outdir/log/ali_to_post.JOB.log \ + gunzip -c $lats/ali.JOB.gz \| \ + ali-to-post ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $lats/final.mdl ark:- ark:- \| \ + post-to-weights ark:- "ark,t:|gzip -c >$outdir/weights.JOB.gz" || exit 1; + for j in $(seq $nj_orig); do gunzip -c $outdir/weights.$j.gz; done | gzip -c >$outdir/weights_lats.gz || exit 1; + rm $outdir/weights.*.gz || exit 1; + elif [ -f $lats/lat.1.gz ]; then + rm $outdir/weights.*.gz 2>/dev/null + $train_cmd JOB=1:$nj_orig $outdir/log/lat_to_post.JOB.log \ + lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $lats/lat.JOB.gz|" ark:/dev/null ark:- \| \ + ali-to-post ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $lats/final.mdl ark:- ark:- \| \ + post-to-weights ark:- "ark:|gzip -c >$outdir/weights.JOB.gz" || exit 1; + for j in $(seq $nj_orig); do gunzip -c $outdir/weights.$j.gz; done | gzip -c >$outdir/weights_lats.gz || exit 1; + rm $outdir/weights.*.gz || exit 1; + else + echo "$0: expected ali.1.gz or lat.1.gz to exist in $lats"; + exit 1; + fi + touch $outdir/.lats-weights.done + fi + if [ ! -f $outdir/.json-weights.done ]; then + echo 'Preparing weights for i-vectors extraction from json' + perl local/ts-vad/prepare_json_weights.pl data/$ivdata/segments $json_ali/all_sess.scp $outdir/weights_json.scp || exit 1; + touch $outdir/.json-weights.done + fi + if [ ! 
-f $outdir/.mult-weights.done ]; then + echo 'Multiplying weights from lats and json' + $train_cmd $outdir/multiply-vectors.log \ + multiply-vectors --length-tolerance=2 ark:"gunzip -c $outdir/weights_lats.gz |" scp:$outdir/weights_json.scp ark,t:"| gzip -c >$outdir/weights_mult.gz" || exit 1; + touch $outdir/.mult-weights.done + fi + if [ ! -f $outdir/.done ]; then + echo 'Preparing single-speaker offline i-vectors' + local/ts-vad/extract_ivectors.sh --cmd $train_cmd --nj $nj_ivec \ + --sub-speaker-frames 0 --max-count 100 \ + data/$ivdata $lang $ivector_dir/extractor $outdir/weights_mult.gz $outdir || exit 1; + touch $outdir/.done + fi +fi + +if [ $stage -le 4 ]; then + outdir=$iv4dir + if [ ! -f $outdir/.done ]; then + mkdir -p $outdir + echo 'Preparing 4-speaker i-vectors' + if [ ! -f data/${srcdata}_hires/utt2spk_cl3 ]; then + echo 'Creating 3 negative utt2spk files with speakers from the same session' + local/ts-vad/make_negative_utt2spk.pl data/${srcdata}_hires/utt2spk \ + data/${srcdata}_hires/utt2spk_cl1 data/${srcdata}_hires/utt2spk_cl2 data/${srcdata}_hires/utt2spk_cl3 || exit 1; + fi + + cat $ivdir/ivectors_spk.*.ark > $outdir/ivectors_spk.ark + $train_cmd JOB=1:3 $outdir/log/apply-map.JOB.log \ + local/ts-vad/apply_map.pl --permissive -f 2 $outdir/ivectors_spk.ark \ $outdir/ivector_online.scp + touch $outdir/.done + fi +fi + +if [ $stage -le 5 ]; then + outdir=$(dirname $targets) + mkdir -p $outdir + tmp=$(dirname $outdir) + mkdir -p $tmp/tmp_sess + nj=$(cat $lats/num_jobs) || exit 1; + if [ ! -f $outdir/.done ]; then + echo 'Creating 8-dimensional dense targets for TS-VAD training' + [ ! -f $tmp/ali_vad_targets_wk.ark ] && grep -v "rev" $tmp/ali_vad_targets.ark > $tmp/ali_vad_targets_wk.ark + [ ! -f $tmp/segments_wk_ali ] && utils/filter_scp.pl $tmp/ali_vad_targets_wk.ark data/${srcdata}_hires/segments > $tmp/segments_wk_ali + [ ! 
-f data/${srcdata}_hires/utt2num_frames ] && feat-to-len scp:data/${srcdata}_hires/feats.scp ark,t:data/${srcdata}_hires/utt2num_frames + + for p in `seq 4`; do + [ ! -f $tmp/utt2spk_shuf${p} ] && cat data/${srcdata}_hires/utt2spk_shuf${p} | sort > $tmp/utt2spk_shuf${p} + done + + nj_dense=$((sess_num*3)) + j=0 + for sess in $sess_list; do + jp=$((3*j+1)) + for p in `seq 4`; do + [ ! -f $tmp/tmp_sess/utt2spk_shuf${p}.$jp ] && grep "$sess" $tmp/utt2spk_shuf${p} | grep -v "sp" > $tmp/tmp_sess/utt2spk_shuf${p}.$jp + done + [ ! -f $tmp/tmp_sess/ali_vad_targets_wk.$jp.ark ] && grep "$sess" $tardir/ali_vad_targets_wk.ark | grep -v "sp" > $tmp/tmp_sess/ali_vad_targets_wk.$jp.ark + + jp=$((3*j+2)) + for p in `seq 4`; do + [ ! -f $tmp/tmp_sess/utt2spk_shuf${p}.$jp ] && grep "$sess" $tmp/utt2spk_shuf${p} | grep "sp0.9" > $tmp/tmp_sess/utt2spk_shuf${p}.$jp + done + [ ! -f $tmp/tmp_sess/ali_vad_targets_wk.$jp.ark ] && grep "$sess" $tardir/ali_vad_targets_wk.ark | grep "sp0.9" > $tmp/tmp_sess/ali_vad_targets_wk.$jp.ark + + jp=$((3*j+3)) + for p in `seq 4`; do + [ ! -f $tmp/tmp_sess/utt2spk_shuf${p}.$jp ] && grep "$sess" $tmp/utt2spk_shuf${p} | grep "sp1.1" > $tmp/tmp_sess/utt2spk_shuf${p}.$jp + done + [ ! 
-f $tmp/tmp_sess/ali_vad_targets_wk.$jp.ark ] && grep "$sess" $tardir/ali_vad_targets_wk.ark | grep "sp1.1" > $tmp/tmp_sess/ali_vad_targets_wk.$jp.ark + j=$((j+1)) + done + + $train_cmd JOB=1:$nj_dense $outdir/log/prepare_targets.JOB.log \ + python3 local/ts-vad/conv_vad_to_dense_targets.py $tmp/tmp_sess/ali_vad_targets_wk.JOB.ark "ark,t,scp:$outdir/dense_targets.JOB.ark,$outdir/dense_targets.JOB.scp" \ + $tmp/tmp_sess/utt2spk_shuf1.JOB $tmp/tmp_sess/utt2spk_shuf2.JOB $tmp/tmp_sess/utt2spk_shuf3.JOB $tmp/tmp_sess/utt2spk_shuf4.JOB \ + data/${srcdata}_hires/segments $tmp/segments_wk_ali data/${srcdata}_hires/utt2num_frames || exit 1; + cat $outdir/dense_targets.*.scp | sort > $targets + + # some diagnostics + compute-cmvn-stats scp:$outdir/dense_targets.1.scp - | cmvn-to-nnet - $outdir/S03.cmvn.nnet + compute-cmvn-stats scp:$outdir/dense_targets.2.scp - | cmvn-to-nnet - $outdir/S03_sp0.9.cmvn.nnet + compute-cmvn-stats scp:$outdir/dense_targets.3.scp - | cmvn-to-nnet - $outdir/S03_sp1.1.cmvn.nnet + + touch $outdir/.done + fi +fi + +if [ $stage -le 14 ]; then + mark=$dir/.done_cfg + if [ ! 
-f $mark ]; then + echo "Creating neural net configs using the xconfig parser" + feat_dim=40 + num_targets=8 + mkdir -p $dir/configs + output_opts="l2-regularize=$l2o" + lstm_opts="l2-regularize=$l2" + linear_opts="l2-regularize=$l2 orthonormal-constraint=-1.0" + cnn_opts="l2-regularize=$l2" + + rproj=128 + nproj=32 + cell=896 + cat < $dir/configs/network.xconfig + input dim=400 name=ivector + input dim=${feat_dim} name=input + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + batchnorm-component name=batchnorm input=idct + + stats-layer name=mean config=mean(-150:1:1:150) input=batchnorm + no-op-component name=batchnorm-cmn input=Sum(batchnorm,Scale(-1.0,mean)) + + no-op-component name=ivector-all input=ReplaceIndex(ivector,t,0) + dim-range-component name=ivector-1 input=ivector-all dim=100 dim-offset=0 + dim-range-component name=ivector-2 input=ivector-all dim=100 dim-offset=100 + dim-range-component name=ivector-3 input=ivector-all dim=100 dim-offset=200 + dim-range-component name=ivector-4 input=ivector-all dim=100 dim-offset=300 + + combine-feature-maps-layer name=combine_inputs input=Append(batchnorm, batchnorm-cmn) num-filters1=1 num-filters2=1 height=$feat_dim + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + linear-component $linear_opts name=aff1 input=Append(cnn4,ivector-1) dim=$((3*rproj)) + fast-lstmp-layer name=blstm1-1-forward input=aff1 cell-dim=$cell recurrent-projection-dim=$rproj 
non-recurrent-projection-dim=$nproj delay=-1 $lstm_opts + fast-lstmp-layer name=blstm1-1-backward input=aff1 cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=1 $lstm_opts + fast-lstmp-layer name=blstm2-1-forward input=Append(blstm1-1-forward, blstm1-1-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-2 $lstm_opts + fast-lstmp-layer name=blstm2-1-backward input=Append(blstm1-1-forward, blstm1-1-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=2 $lstm_opts + + linear-component $linear_opts name=aff2 input=Append(cnn4,ivector-2) dim=$((3*rproj)) + fast-lstmp-layer name=blstm1-2-forward input=aff2 cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-1 $lstm_opts + fast-lstmp-layer name=blstm1-2-backward input=aff2 cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=1 $lstm_opts + fast-lstmp-layer name=blstm2-2-forward input=Append(blstm1-2-forward, blstm1-2-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-2 $lstm_opts + fast-lstmp-layer name=blstm2-2-backward input=Append(blstm1-2-forward, blstm1-2-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=2 $lstm_opts + + linear-component $linear_opts name=aff3 input=Append(cnn4,ivector-3) dim=$((3*rproj)) + fast-lstmp-layer name=blstm1-3-forward input=aff3 cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-1 $lstm_opts + fast-lstmp-layer name=blstm1-3-backward input=aff3 cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=1 $lstm_opts + fast-lstmp-layer name=blstm2-3-forward input=Append(blstm1-3-forward, blstm1-3-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-2 $lstm_opts + fast-lstmp-layer 
name=blstm2-3-backward input=Append(blstm1-3-forward, blstm1-3-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=2 $lstm_opts + + linear-component $linear_opts name=aff4 input=Append(cnn4,ivector-4) dim=$((3*rproj)) + fast-lstmp-layer name=blstm1-4-forward input=aff4 cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-1 $lstm_opts + fast-lstmp-layer name=blstm1-4-backward input=aff4 cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=1 $lstm_opts + fast-lstmp-layer name=blstm2-4-forward input=Append(blstm1-4-forward, blstm1-4-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-2 $lstm_opts + fast-lstmp-layer name=blstm2-4-backward input=Append(blstm1-4-forward, blstm1-4-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=2 $lstm_opts + + fast-lstmp-layer name=blstm3-forward input=Append(blstm2-1-forward, blstm2-1-backward, blstm2-2-forward, blstm2-2-backward, blstm2-3-forward, blstm2-3-backward, blstm2-4-forward, blstm2-4-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-3 $lstm_opts + fast-lstmp-layer name=blstm3-backward input=Append(blstm2-1-forward, blstm2-1-backward, blstm2-2-forward, blstm2-2-backward, blstm2-3-forward, blstm2-3-backward, blstm2-4-forward, blstm2-4-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=3 $lstm_opts + + output-layer $output_opts input=Append(blstm3-forward, blstm3-backward) name=output dim=2 + output-layer $output_opts input=Append(blstm3-forward, blstm3-backward) name=output2 dim=2 + output-layer $output_opts input=Append(blstm3-forward, blstm3-backward) name=output3 dim=2 + output-layer $output_opts input=Append(blstm3-forward, blstm3-backward) name=output4 dim=2 +EOF + steps/nnet3/xconfig_to_configs.py \ + 
--xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs || exit 1 + echo "num_targets=$num_targets" >> $dir/configs/vars + + echo "Modifying final.config file to combine 4 softmax layers in the output-node " + sed -i 's:output\.:output1\.:g' $dir/configs/final.config + mv $dir/configs/final.config $dir/configs/final.config.tmp + grep -v "output\-node" $dir/configs/final.config.tmp > $dir/configs/final.config + echo "output-node name=output input=Append(output1.log-softmax, output2.log-softmax, output3.log-softmax, output4.log-softmax)" >> $dir/configs/final.config + + echo "Modifying final.config file to enforce weigths sharing in affine and blstm layers" + sed -i s:component\ name=aff1:component\ name=aff-uni: $dir/configs/final.config + sed -i s:component=aff1:component=aff-uni: $dir/configs/final.config + sed -i s:component=aff2:component=aff-uni: $dir/configs/final.config + sed -i s:component=aff3:component=aff-uni: $dir/configs/final.config + sed -i s:component=aff4:component=aff-uni: $dir/configs/final.config + sed -i s:component\ name=blstm1-1:component\ name=blstm1-uni: $dir/configs/final.config + sed -i s:component\ name=blstm2-1:component\ name=blstm2-uni: $dir/configs/final.config + sed -i s:component=blstm1-1:component=blstm1-uni: $dir/configs/final.config + sed -i s:component=blstm1-2:component=blstm1-uni: $dir/configs/final.config + sed -i s:component=blstm1-3:component=blstm1-uni: $dir/configs/final.config + sed -i s:component=blstm1-4:component=blstm1-uni: $dir/configs/final.config + sed -i s:component=blstm2-1:component=blstm2-uni: $dir/configs/final.config + sed -i s:component=blstm2-2:component=blstm2-uni: $dir/configs/final.config + sed -i s:component=blstm2-3:component=blstm2-uni: $dir/configs/final.config + sed -i s:component=blstm2-4:component=blstm2-uni: $dir/configs/final.config + mv $dir/configs/final.config $dir/configs/final.config.tmp + grep -v "component\ name=aff2" $dir/configs/final.config.tmp | grep -v "component\ 
name=aff3" | grep -v "component\ name=aff4" | \ + grep -v "component\ name=blstm1-2" | grep -v "component\ name=blstm1-3" | grep -v "component\ name=blstm1-4" | \ + grep -v "component\ name=blstm2-2" | grep -v "component\ name=blstm2-3" | grep -v "component\ name=blstm2-4" > $dir/configs/final.config + nnet3-init --binary=false $dir/configs/final.config $dir/configs/init.raw || exit 1; + touch $mark + fi +fi + +if [ ! -f data/$data/utt2uniq.done ]; then + [ -f data/$data/utt2uniq ] && mv data/$data/utt2uniq data/$data/utt2uniq.bak + local/ts-vad/make_utt2uniq.pl data/$data/utt2spk data/$data/utt2uniq || exit 1; + touch data/$data/utt2uniq.done +fi + +if [ $stage -le 15 ]; then + mark=$dir/.done_dnn + if [ ! -f $mark ]; then + cp "$(readlink -f $0)" "$dir" + steps/nnet3/train_raw_rnn.py \ + --stage=$train_stage \ + --cmd="$train_cmd" \ + --feat.online-ivector-dir=$iv4dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2 \ + --trainer.num-epochs=$num_epochs \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.momentum=0.5 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.$lrate \ + --trainer.optimization.final-effective-lrate=0.0$lrate \ + --trainer.rnn.num-chunk-per-minibatch=128 \ + --trainer.samples-per-iter=15000 \ + --egs.chunk-left-context=30 \ + --egs.chunk-right-context=30 \ + --egs.chunk-width=40 \ + --use-dense-targets true \ + --feat-dir data/$data \ + --targets-scp $targets \ + --egs.cmd=run.pl \ + --egs.dir=$common_egs_dir \ + --cleanup.remove-egs false \ + --cleanup.preserve-model-interval=100 \ + --use-gpu=true \ + --dir=$dir || exit 1 + touch $mark + fi +fi + +echo Done diff --git a/egs/chime6/s5b_track2/local/ts-vad/apply_map.pl b/egs/chime6/s5b_track2/local/ts-vad/apply_map.pl new file mode 100755 index 00000000000..6a61cf647cb --- /dev/null +++ 
b/egs/chime6/s5b_track2/local/ts-vad/apply_map.pl @@ -0,0 +1,98 @@ +#!/usr/bin/env perl +use warnings; #sed replacement for -w perl parameter +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This program is a bit like ./sym2int.pl in that it applies a map +# to things in a file, but it's a bit more general in that it doesn't +# assume the things being mapped to are single tokens, they could +# be sequences of tokens. See the usage message. +# Compared to the utils/apply_map.pl, permissive mode is fixed. + +$permissive = 0; + +for ($x = 0; $x <= 2; $x++) { + + if (@ARGV > 0 && $ARGV[0] eq "-f") { + shift @ARGV; + $field_spec = shift @ARGV; + if ($field_spec =~ m/^\d+$/) { + $field_begin = $field_spec - 1; $field_end = $field_spec - 1; + } + if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10) + if ($1 ne "") { + $field_begin = $1 - 1; # Change to zero-based indexing. + } + if ($2 ne "") { + $field_end = $2 - 1; # Change to zero-based indexing. + } + } + if (!defined $field_begin && !defined $field_end) { + die "Bad argument to -f option: $field_spec"; + } + } + + if (@ARGV > 0 && $ARGV[0] eq '--permissive') { + shift @ARGV; + # Mapping is optional (a line with a missing key is skipped with a warning) + $permissive = 1; + } +} + +if(@ARGV != 1) { + print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n"; + print STDERR <<'EOF'; +Usage: apply_map.pl [options] map output + options: [-f ] [--permissive] + This applies a map to some specified fields of some input text: + For each line in the map file: the first field is the thing we + map from, and the remaining fields are the sequence we map it to. + The -f (field-range) option says which fields of the input file the map + map should apply to. + If the --permissive option is supplied, fields which are not present + in the map will be left as they were. 
+ Applies the map 'map' to all input text, where each line of the map + is interpreted as a map from the first field to the list of the other fields + Note: can look like 4-5, or 4-, or 5-, or 1, it means the field + range in the input to apply the map to. + e.g.: echo A B | apply_map.pl a.txt + where a.txt is: + A a1 a2 + B b + will produce: + a1 a2 b +EOF + exit(1); +} + +($map_file) = @ARGV; +open(M, "<$map_file") || die "Error opening map file $map_file: $!"; + +while () { + @A = split(" ", $_); + @A >= 1 || die "apply_map.pl: empty line."; + $i = shift @A; + $o = join(" ", @A); + $map{$i} = $o; +} + +CU: while() { + @A = split(" ", $_); + for ($x = 0; $x < @A; $x++) { + if ( (!defined $field_begin || $x >= $field_begin) + && (!defined $field_end || $x <= $field_end)) { + $a = $A[$x]; + if (!defined $map{$a}) { + if (!$permissive) { + die "apply_map.pl: undefined key $a in $map_file\n"; + } else { + print STDERR "apply_map.pl: warning! missing key $a in $map_file\n"; + next CU; + } + } else { + $A[$x] = $map{$a}; + } + } + } + print join(" ", @A) . "\n"; +} diff --git a/egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh b/egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh index 9aeb40d83df..54e6673ad16 100755 --- a/egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh +++ b/egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh @@ -7,7 +7,7 @@ # 2020 Ivan Medennikov (STC-innovations Ltd) # Apache 2.0 -# Computes training alignments using nnet3 DNN, with output to lattices. +# Computes TS-VAD weights using raw nnet3 network. # Begin configuration section. 
nj=4 diff --git a/egs/chime6/s5b_track2/local/ts-vad/conv_ali_to_vad_012.py b/egs/chime6/s5b_track2/local/ts-vad/conv_ali_to_vad_012.py new file mode 100644 index 00000000000..291ca3e8d67 --- /dev/null +++ b/egs/chime6/s5b_track2/local/ts-vad/conv_ali_to_vad_012.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Yuri Khokhlov, Ivan Medennikov (STC-innovations Ltd) +# Apache 2.0. + +"""This script transforms phone-indices in alignment to 0(silence phones), 1(speech phones), 2(spn phones)""" + +import os +import argparse +import numpy as np + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Usage: conv_ali_to_vad_012.py 1:2:3:4:5 6:7:8:9:10 ') + parser.add_argument('silence_phones', type=str) + parser.add_argument('spn_phones', type=str) + parser.add_argument('phone_ali', type=str) + parser.add_argument('vad_ali', type=str) + args = parser.parse_args() + + print('Options:') + print(' Silence phones (colon-separated list): {}'.format(args.silence_phones)) + print(' Spoken-noise phones (colon-separated list): {}'.format(args.spn_phones)) + print(' Input phone ali in text format: {}'.format(args.phone_ali)) + print(' Output vad ali in text format: {}'.format(args.vad_ali)) + + silence_set = set(args.silence_phones.split(':')) + print("sil phones: ") + print(args.silence_phones.split(':')) + spn_set = set(args.spn_phones.split(':')) + print("spn phones: ") + print(args.spn_phones.split(':')) + + assert os.path.exists(args.phone_ali), 'File does not exist {}'.format(args.phone_ali) + parent = os.path.dirname(os.path.abspath(args.vad_ali)) + if not os.path.exists(parent): + os.makedirs(parent) + + print('Starting to convert') + count = 0 + with open(args.phone_ali) as ali_file: + with open(args.vad_ali, 'wt') as vad_file: + for line in ali_file: + line = line.strip() + if len(line) == 0: + continue + parts = line.split(' ') + parts = list(filter(None, parts)) + assert len(parts) > 1, 'Empty alignment in line {}'.format(line) + 
vad_file.write('{}'.format(parts[0])) + phones = parts[1:] + for phone in phones: + if phone in silence_set: + vad_file.write(' 0') + elif phone in spn_set: + vad_file.write(' 2') + else: + vad_file.write(' 1') + vad_file.write('\n') + count += 1 + vad_file.close() + ali_file.close() + print('Converted alignments for {} utterances'.format(count)) + diff --git a/egs/chime6/s5b_track2/local/ts-vad/conv_vad_to_dense_targets.py b/egs/chime6/s5b_track2/local/ts-vad/conv_vad_to_dense_targets.py new file mode 100644 index 00000000000..cc5bbf4d93e --- /dev/null +++ b/egs/chime6/s5b_track2/local/ts-vad/conv_vad_to_dense_targets.py @@ -0,0 +1,336 @@ +#!/usr/bin/env python +# Copyright 2020 Ivan Medennikov, Maxim Korenevsky (STC-innovations Ltd) +# Apache 2.0. + +"""This script prepares overlapped 4-speaker dense targets for TS-VAD training + using segments and VAD alignment of kinect and worn utterances + (VAD alignment from worn utterances is more reliable than from kinects, + so we use large scaling factor for worn utterances). 
+ The resulting targets are 4 pairs of probabilities: + (sil_spk1, speech_spk1, sil_spk2, speech_spk2, sil_spk3, speech_spk3, sil_spk4, speech_spk4)""" + +import os +import argparse +import numpy as np +from kaldiio import WriteHelper + +def ProcessSession(segments, writer, worn_scale): + segments.sort(key = lambda tup: tup[6]) #sort by start time + for i in range(len(segments)): + utt_id, spk, n_spk, n_spk2, n_spk3, n_spk4, start, end, vad_info, device = segments[i] + if n_spk == '': + continue + + vad_info_dense = np.zeros((vad_info[0],8)) + + # looking for left-side overlappings + i1 = i-1 + cnt = 0 + nls=0 + nl1=0 + nl2=0 + nl3=0 + nl4=0 + while i1>=0: + utt_id1, spk1, x, y, z, q, start1, end1, vad_info1, device1 = segments[i1] + if x != '': + i1 -= 1 + continue + if end1 > start: + scale = 1 + if device1 == 'W': + scale = worn_scale + if spk1 == spk: + nls+=1 + if spk1 == n_spk: + nl1+=1 + #print('utt {}, spk {}: left intersection with spk 1 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale)) + for k in range(min(end,end1)-start): + if vad_info1[start - start1 + k] == '1': + vad_info_dense[k][1] += scale + else: + vad_info_dense[k][0] += scale + elif spk1 == n_spk2: + nl2+=1 + #print('utt {}, spk {}: left intersection with spk 2 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale)) + for k in range(min(end,end1)-start): + if vad_info1[start - start1 + k] == '1': + vad_info_dense[k][3] += scale + else: + vad_info_dense[k][2] += scale + elif spk1 == n_spk3: + nl3+=1 + #print('utt {}, spk {}: left intersection with spk 3 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale)) + for k in range(min(end,end1)-start): + if vad_info1[start - start1 + k] == '1': + vad_info_dense[k][5] += scale + else: + vad_info_dense[k][4] += scale + elif spk1 == n_spk4: + nl4+=1 + #print('utt {}, spk {}: left intersection with spk 4 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale)) + for k in range(min(end,end1)-start): + if vad_info1[start - start1 + k] 
== '1': + vad_info_dense[k][7] += scale + else: + vad_info_dense[k][6] += scale + else: + cnt += 1 + if cnt==10: + break + i1 -= 1 + + # looking for rights-side overlappings + i1 = i+1 + cnt = 0 + nrs=0 + nr1=0 + nr2=0 + nr3=0 + nr4=0 + while i1 start1: + scale = 1 + if device1 == 'W': + scale = worn_scale + if spk1 == spk: + nrs+=1 + if spk1 == n_spk: + nr1+=1 + #print('utt {}, spk {}: right intersection with spk 1 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale)) + for k in range(min(end, end1)-start1): + if vad_info1[k] == '1': + vad_info_dense[start1-start+k][1] += scale + else: + vad_info_dense[start1-start+k][0] += scale + elif spk1 == n_spk2: + nr2+=1 + #print('utt {}, spk {}: right intersection with spk 2 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale)) + for k in range(min(end, end1)-start1): + if vad_info1[k] == '1': + vad_info_dense[start1-start+k][3] += scale + else: + vad_info_dense[start1-start+k][2] += scale + elif spk1 == n_spk3: + nr3+=1 + #print('utt {}, spk {}: right intersection with spk 3 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale)) + for k in range(min(end, end1)-start1): + if vad_info1[k] == '1': + vad_info_dense[start1-start+k][5] += scale + else: + vad_info_dense[start1-start+k][4] += scale + elif spk1 == n_spk4: + nr4+=1 + #print('utt {}, spk {}: right intersection with spk 4 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale)) + for k in range(min(end, end1)-start1): + if vad_info1[k] == '1': + vad_info_dense[start1-start+k][7] += scale + else: + vad_info_dense[start1-start+k][6] += scale + else: + cnt += 1 + if cnt==10: + break + i1 += 1 + + for j in range(vad_info[0]): + for head in range(4): + total=vad_info_dense[j][2*head]+vad_info_dense[j][2*head+1] + if total == 0: + vad_info_dense[j][2*head] = 1 + vad_info_dense[j][2*head+1] = 0 + else: + vad_info_dense[j][2*head] /= total + vad_info_dense[j][2*head+1] /= total + + #print("utt {}: {}+{}+{}+{} left-overlaps and {} left-self-overlaps, 
{}+{}+{}+{} right-overlaps and {} rights-self-overlaps".format(utt_id,nl1,nl2,nl3,nl4,nls,nr1,nr2,nr3,nr4,nrs)) + if nls == 0 and nrs == 0: + print("WARNING: utt {} does not have targets!".format(utt_id)) + continue + writer(utt_id, vad_info_dense) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Usage: conv_vad_to_dense_targets.py ') + parser.add_argument('vad_ali', type=str) + parser.add_argument('wspec', type=str) + parser.add_argument('utt2spk_n1', type=str) + parser.add_argument('utt2spk_n2', type=str) + parser.add_argument('utt2spk_n3', type=str) + parser.add_argument('utt2spk_n4', type=str) + parser.add_argument('segments_utt', type=str) + parser.add_argument('segments_ali', type=str) + parser.add_argument('utt2dur', type=str) + parser.add_argument('--worn_scale', type=float, default=10) + + args = parser.parse_args() + + print('Options:') + print(' Input vad ali in text format: {}'.format(args.vad_ali)) + print(' Output wspecifier: {}'.format(args.wspec)) + print(' Utterance-to-spk map for head #1: {}'.format(args.utt2spk_n1)) + print(' Utterance-to-spk map for head #2: {}'.format(args.utt2spk_n2)) + print(' Utterance-to-spk map for head #3: {}'.format(args.utt2spk_n3)) + print(' Utterance-to-spk map for head #4: {}'.format(args.utt2spk_n4)) + print(' Segments for uid: {}'.format(args.segments_utt)) + print(' Segments for ali: {}'.format(args.segments_ali)) + print(' Utt2dur (in frames) for uid: {}'.format(args.utt2dur)) + print(' Worn scaling factor: {}'.format(args.worn_scale)) + + assert os.path.exists(args.vad_ali), 'File does not exist {}'.format(args.vad_ali) + + print('Starting to convert') + + print('Loading speaker info for head #1') + n_speakers=dict() + with open(args.utt2spk_n1,'r') as f: + for line in f: + uid, n_sid = line.strip().split() + n_spk = n_sid.split('_')[0] + n_spk_parts = n_spk.split('-')[:-2] + if n_spk_parts[0][:3]=='rev': + del n_spk_parts[0] + if len(n_spk_parts)>1 and n_spk_parts[1][:3]=='rev': + 
del n_spk_parts[1] + n_spk = '-'.join(n_spk_parts) + n_speakers[uid] = n_spk + + print('Loading speaker info for head #2') + n_speakers2=dict() + with open(args.utt2spk_n2,'r') as f: + for line in f: + uid, n_sid = line.strip().split() + n_spk = n_sid.split('_')[0] + n_spk_parts = n_spk.split('-')[:-2] + if n_spk_parts[0][:3]=='rev': + del n_spk_parts[0] + if len(n_spk_parts)>1 and n_spk_parts[1][:3]=='rev': + del n_spk_parts[1] + n_spk = '-'.join(n_spk_parts) + n_speakers2[uid] = n_spk + + print('Loading speaker info for head #3') + n_speakers3=dict() + with open(args.utt2spk_n3,'r') as f: + for line in f: + uid, n_sid = line.strip().split() + n_spk = n_sid.split('_')[0] + n_spk_parts = n_spk.split('-')[:-2] + if n_spk_parts[0][:3]=='rev': + del n_spk_parts[0] + if len(n_spk_parts)>1 and n_spk_parts[1][:3]=='rev': + del n_spk_parts[1] + n_spk = '-'.join(n_spk_parts) + n_speakers3[uid] = n_spk + + print('Loading speaker info for head #4') + n_speakers4=dict() + with open(args.utt2spk_n4,'r') as f: + for line in f: + uid, n_sid = line.strip().split() + n_spk = n_sid.split('_')[0] + n_spk_parts = n_spk.split('-')[:-2] + if n_spk_parts[0][:3]=='rev': + del n_spk_parts[0] + if len(n_spk_parts)>1 and n_spk_parts[1][:3]=='rev': + del n_spk_parts[1] + n_spk = '-'.join(n_spk_parts) + n_speakers4[uid] = n_spk + + print('Loading segments boundaries') + seg_by_uid = dict() + with open(args.segments_utt) as f: + for line in f: + uid, wav_id, start, end = line.strip().split() + seg_by_uid[uid]=(int(float(start)*100),int(float(end)*100)) + + print('Loading durations of utterances') + len_by_uid = dict() + with open(args.utt2dur) as f: + for line in f: + uid, length = line.strip().split() + len_by_uid[uid]=int(length) + + print('Loading VAD alignment segments boundaries') + seg_by_ali_uid = dict() + with open(args.segments_ali) as f: + for line in f: + utt_id, wav_id, start, end = line.strip().split() + seg_by_ali_uid[utt_id]=(int(float(start)*100),int(float(end)*100)) + + 
print('Loading VAD alignment') + seg_by_sess = dict() + with open(args.vad_ali) as f: + for line in f: + vad_info = line.strip().split() + utt_id_ = vad_info[0] + vad_info = vad_info[1:] + + utt_id = utt_id_ + utt_id_parts=utt_id.split('-') + utt_id = '-'.join(utt_id_parts[:-3]) + spk, sess, device = utt_id.split('_')[:3] + spk_parts = spk.split('-') + if spk_parts[0][:3]=='rev': + del spk_parts[0] + if len(spk_parts)>1 and spk_parts[1][:3]=='rev': + del spk_parts[1] + spk = '-'.join(spk_parts) + + if device == 'NOLOCATION.L' or device == 'NOLOCATION.R': + device = 'W' + + if sess not in seg_by_sess: + seg_by_sess[sess] = list() + start, end = seg_by_ali_uid[utt_id_] + + assert end-start >= len(vad_info), '{} {} {}'.format(start, end, len(vad_info)) + assert end-start-len(vad_info)<=3, '{} {} {}'.format(start, end, len(vad_info)) + end = start + len(vad_info) + seg_by_sess[sess].append((utt_id_, spk, '', '', '', '', start, end, vad_info, device)) + + skip=0 + for uid in n_speakers.keys(): + n_spk = n_speakers[uid].split('_')[0] + if uid not in n_speakers2.keys(): + skip+=1 + continue + n_spk2 = n_speakers2[uid].split('_')[0] + if uid not in n_speakers3.keys(): + skip+=1 + continue + n_spk3 = n_speakers3[uid].split('_')[0] + if uid not in n_speakers4.keys(): + skip+=1 + continue + n_spk4 = n_speakers4[uid].split('_')[0] + spk, sess = uid.split('_')[:2] + spk_parts = spk.split('-') + if spk_parts[0][:3]=='rev': + del spk_parts[0] + if len(spk_parts)>1 and spk_parts[1][:3]=='rev': + del spk_parts[1] + spk = '-'.join(spk_parts) + if uid not in seg_by_uid.keys(): + skip+=1 + continue + start, end = seg_by_uid[uid] + if uid not in len_by_uid.keys(): + skip+=1 + continue + vad_info = len_by_uid[uid] + assert end-start >= vad_info, '{} {} {}'.format(start, end, vad_info) + assert end-start-vad_info<=3, '{} {} {}'.format(start, end, vad_info) + end = start + vad_info + + seg_by_sess[sess].append((uid, spk, n_spk, n_spk2, n_spk3, n_spk4, start, end, [vad_info], 'kinect')) + 
+ print('{} utts are skipped as missing in utt2spk'.format(skip)) + print('Processing segments session-by-session') + with WriteHelper(args.wspec) as writer: + for sess in seg_by_sess: + print(sess) + ProcessSession(seg_by_sess[sess], writer, args.worn_scale) \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/ts-vad/extract_ivectors.sh b/egs/chime6/s5b_track2/local/ts-vad/extract_ivectors.sh new file mode 100755 index 00000000000..e9b3b95c178 --- /dev/null +++ b/egs/chime6/s5b_track2/local/ts-vad/extract_ivectors.sh @@ -0,0 +1,296 @@ +#!/bin/bash + +# Copyright 2013 Daniel Povey +# Apache 2.0. + + +# This script computes iVectors in the same format as extract_ivectors_online.sh, +# except that they are actually not really computed online, they are first computed +# per speaker and just duplicated many times. +# This is mainly intended for use in decoding, where you want the best possible +# quality of iVectors. +# +# This setup also makes it possible to use a previous decoding or alignment, to +# down-weight silence in the stats (default is --silence-weight 0.0). +# +# This is for when you use the "online-decoding" setup in an offline task, and +# you want the best possible results. +# Compared to the steps/online/nnet2/extract_ivectors.sh, this script uses +# modified apply_map.pl with permissive mode. + +# Begin configuration section. +nj=30 +cmd="run.pl" +stage=0 +num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select +min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) +ivector_period=10 +posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for + # inter-frame correlations. Making this small during iVector + # extraction is equivalent to scaling up the prior, and will + # will tend to produce smaller iVectors where data-counts are + # small. 
It's not so important that this match the value + # used when training the iVector extractor, but more important + # that this match the value used when you do real online decoding + # with the neural nets trained with these iVectors. +max_count=100 # Interpret this as a number of frames times posterior scale... + # this config ensures that once the count exceeds this (i.e. + # 1000 frames, or 10 seconds, by default), we start to scale + # down the stats, accentuating the prior term. This seems quite + # important for some reason. +sub_speaker_frames=0 # If >0, during iVector estimation we split each speaker + # into possibly many 'sub-speakers', each with at least + # this many frames of speech (evaluated after applying + # silence_weight, so will typically exclude silence. + # e.g. set this to 1000, and it will require at least 10 seconds + # of speech per sub-speaker. + +compress=true # If true, compress the iVectors stored on disk (it's lossy + # compression, as used for feature matrices). +silence_weight=0.0 +acwt=0.1 # used if input is a decode dir, to get best path from lattices. +mdl=final # change this if decode directory did not have ../final.mdl present. +num_threads=1 # Number of threads used by ivector-extract. It is usually not + # helpful to set this to > 1. It is only useful if you have + # fewer speakers than the number of jobs you want to run. + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ] && [ $# != 5 ]; then + echo "Usage: $0 [options] [||] " + echo " e.g.: $0 data/test data/lang exp/nnet2_online/extractor exp/tri3/decode_test exp/nnet2_online/ivectors_test" + echo "If is provided, it is converted to frame-weights " + echo "giving silence frames a weight of --silence-weight (default: 0.0). 
" + echo "If is provided, it must be a single archive file compressed " + echo "(using gunzip) containing per-frame weights for each utterance." + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # Number of jobs (also see num-threads)" + echo " --num-threads # Number of threads for each job" + echo " # Ignored if or supplied." + echo " --stage # To control partial reruns" + echo " --num-gselect # Number of Gaussians to select using" + echo " # diagonal model." + echo " --min-post # Pruning threshold for posteriors" + echo " --ivector-period # How often to extract an iVector (frames)" + echo " --posterior-scale # Scale on posteriors in iVector extraction; " + echo " # affects strength of prior term." + + exit 1; +fi + +if [ $# -eq 4 ]; then + data=$1 + lang=$2 + srcdir=$3 + dir=$4 +else # 5 arguments + data=$1 + lang=$2 + srcdir=$3 + ali_or_decode_dir_or_weights=$4 + dir=$5 +fi + +for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \ + $lang/phones.txt $srcdir/online_cmvn.conf $srcdir/final.mat; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +mkdir -p $dir/log +silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + +if [ ! -z "$ali_or_decode_dir_or_weights" ]; then + + + if [ -f $ali_or_decode_dir_or_weights/ali.1.gz ]; then + if [ ! -f $ali_or_decode_dir_or_weights/${mdl}.mdl ]; then + echo "$0: expected $ali_or_decode_dir_or_weights/${mdl}.mdl to exist." 
+ exit 1; + fi + nj_orig=$(cat $ali_or_decode_dir_or_weights/num_jobs) || exit 1; + + if [ $stage -le 0 ]; then + rm $dir/weights.*.gz 2>/dev/null + + $cmd JOB=1:$nj_orig $dir/log/ali_to_post.JOB.log \ + gunzip -c $ali_or_decode_dir_or_weights/ali.JOB.gz \| \ + ali-to-post ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir_or_weights/final.mdl ark:- ark:- \| \ + post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1; + + # put all the weights in one archive. + for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1; + rm $dir/weights.*.gz || exit 1; + fi + + elif [ -f $ali_or_decode_dir_or_weights/lat.1.gz ]; then + nj_orig=$(cat $ali_or_decode_dir_or_weights/num_jobs) || exit 1; + if [ ! -f $ali_or_decode_dir_or_weights/../${mdl}.mdl ]; then + echo "$0: expected $ali_or_decode_dir_or_weights/../${mdl}.mdl to exist." + exit 1; + fi + + + if [ $stage -le 0 ]; then + rm $dir/weights.*.gz 2>/dev/null + + $cmd JOB=1:$nj_orig $dir/log/lat_to_post.JOB.log \ + lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir_or_weights/lat.JOB.gz|" ark:/dev/null ark:- \| \ + ali-to-post ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir_or_weights/../${mdl}.mdl ark:- ark:- \| \ + post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1; + + # put all the weights in one archive. 
+ for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1; + rm $dir/weights.*.gz || exit 1; + fi + elif [ -f $ali_or_decode_dir_or_weights ] && gunzip -c $ali_or_decode_dir_or_weights >/dev/null; then + cp $ali_or_decode_dir_or_weights $dir/weights.gz || exit 1; + else + echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir_or_weights"; + exit 1; + fi +fi + +sdata=$data/split$nj; +utils/split_data.sh $data $nj || exit 1; + +echo $ivector_period > $dir/ivector_period || exit 1; +splice_opts=$(cat $srcdir/splice_opts) + +gmm_feats="ark,s,cs:apply-cmvn-online --spk2utt=ark:$sdata/JOB/spk2utt --config=$srcdir/online_cmvn.conf $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" +feats="ark,s,cs:splice-feats $splice_opts scp:$sdata/JOB/feats.scp ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + +# This adds online-cmvn in $feats, upon request (configuration taken from UBM), +[ -f $srcdir/online_cmvn_iextractor ] && feats="$gmm_feats" + + +if [ $sub_speaker_frames -gt 0 ]; then + + if [ $stage -le 1 ]; then + # We work out 'fake' spk2utt files that possibly split each speaker into multiple pieces. + if [ ! -z "$ali_or_decode_dir_or_weights" ]; then + gunzip -c $dir/weights.gz | copy-vector ark:- ark,t:- | \ + awk '{ sum=0; for (n=3;n $dir/utt_counts || exit 1; + else + feat-to-len scp:$data/feats.scp ark,t:- > $dir/utt_counts || exit 1; + fi + if ! [ $(wc -l <$dir/utt_counts) -eq $(wc -l <$data/feats.scp) ]; then + echo "$0: error getting per-utterance counts." 
+# exit 0; + fi +# cat $data/spk2utt | python -c " + utils/filter_scp.pl $dir/utt_counts $data/utt2spk | utils/utt2spk_to_spk2utt.pl | python -c " +import sys +utt_counts = {} +trash = list(map(lambda x: utt_counts.update({x.split()[0]:float(x.split()[1])}), open('$dir/utt_counts').readlines())) +sub_speaker_frames = $sub_speaker_frames +lines = sys.stdin.readlines() +total_counts = {} +for line in lines: + parts = line.split() + spk = parts[0] + total_counts[spk] = 0 + for utt in parts[1:]: + total_counts[spk] += utt_counts[utt] + +for line_index in range(len(lines)): + line = lines[line_index] + parts = line.split() + spk = parts[0] + + numeric_id=0 + current_count = 0 + covered_count = 0 + current_utts = [] + for utt in parts[1:]: + try: + current_count += utt_counts[utt] + covered_count += utt_counts[utt] + except KeyError: + raise Exception('No count found for the utterance {0}.'.format(utt)) + current_utts.append(utt) + if ((current_count >= $sub_speaker_frames) and ((total_counts[spk] - covered_count) >= $sub_speaker_frames)) or (utt == parts[-1]): + spk_partial = '{0}-{1:06x}'.format(spk, numeric_id) + numeric_id += 1 + print ('{0} {1}'.format(spk_partial, ' '.join(current_utts))) + current_utts = [] + current_count = 0 +"> $dir/spk2utt || exit 1; + mkdir -p $dir/split$nj + # create split versions of our spk2utt file. + for j in $(seq $nj); do + mkdir -p $dir/split$nj/$j + utils/filter_scp.pl -f 2 $sdata/$j/utt2spk <$dir/spk2utt >$dir/split$nj/$j/spk2utt || exit 1; + utils/spk2utt_to_utt2spk.pl <$dir/split$nj/$j/spk2utt >$dir/split$nj/$j/utt2spk || exit 1; + done + fi + this_sdata=$dir/split$nj +else + this_sdata=$sdata +fi + +if [ $stage -le 2 ]; then + if [ ! 
-z "$ali_or_decode_dir_or_weights" ]; then + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ + weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \ + ivector-extract --num-threads=$num_threads --acoustic-weight=$posterior_scale --compute-objf-change=true \ + --max-count=$max_count --spk2utt=ark:$this_sdata/JOB/spk2utt \ + $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1; + else + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ + ivector-extract --num-threads=$num_threads --acoustic-weight=$posterior_scale --compute-objf-change=true \ + --max-count=$max_count --spk2utt=ark:$this_sdata/JOB/spk2utt \ + $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1; + fi +fi + +# get an utterance-level set of iVectors (just duplicate the speaker-level ones). +# note: if $this_sdata is set $dir/split$nj, then these won't be real speakers, they'll +# be "sub-speakers" (speakers split up into multiple utterances). +if [ $stage -le 3 ]; then + for j in $(seq $nj); do + local/ts-vad/apply_map.pl --permissive -f 2 $dir/ivectors_spk.$j.ark <$this_sdata/$j/utt2spk >$dir/ivectors_utt.$j.ark || exit 1; + done +fi + +ivector_dim=$[$(head -n 1 $dir/ivectors_spk.1.ark | wc -w) - 3] || exit 1; +echo "$0: iVector dim is $ivector_dim" + +base_feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1; + +start_dim=$base_feat_dim +end_dim=$[$base_feat_dim+$ivector_dim-1] +absdir=$(utils/make_absolute.sh $dir) + +if [ $stage -le 4 ]; then + # here, we are just using the original features in $sdata/JOB/feats.scp for + # their number of rows; we use the select-feats command to remove those + # features and retain only the iVector features. 
+ $cmd JOB=1:$nj $dir/log/duplicate_feats.JOB.log \ + append-vector-to-feats scp:$sdata/JOB/feats.scp ark:$dir/ivectors_utt.JOB.ark ark:- \| \ + select-feats "$start_dim-$end_dim" ark:- ark:- \| \ + subsample-feats --n=$ivector_period ark:- ark:- \| \ + copy-feats --compress=$compress ark:- \ + ark,scp:$absdir/ivector_online.JOB.ark,$absdir/ivector_online.JOB.scp || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: combining iVectors across jobs" + for j in $(seq $nj); do cat $dir/ivector_online.$j.scp; done >$dir/ivector_online.scp || exit 1; +fi + +steps/nnet2/get_ivector_id.sh $srcdir > $dir/final.ie.id || exit 1 + +echo "$0: done extracting (pseudo-online) iVectors to $dir using the extractor in $srcdir." + diff --git a/egs/chime6/s5b_track2/local/ts-vad/make_json_align.py b/egs/chime6/s5b_track2/local/ts-vad/make_json_align.py new file mode 100755 index 00000000000..1d13a85a425 --- /dev/null +++ b/egs/chime6/s5b_track2/local/ts-vad/make_json_align.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Yuri Khokhlov, Ivan Medennikov (STC-innovations Ltd) +# Apache 2.0. + +"""This script converts JSON to VAD alignment. +Single-speaker speech is treated as 1, whereas both silence and overlapped speech as 0. 
+This can be used for excluding overlapping regions from i-vectors estimation""" + +import os +import json +import datetime +import argparse +import numpy as np +from pathlib import Path +from kaldiio import WriteHelper + +def time_to_seconds(time): + parts = time.split(':') + return datetime.timedelta(hours=float(parts[0]), minutes=float(parts[1]), seconds=float(parts[2])).total_seconds() + +class Segment: + def __init__(self, begin, end, label): + self.begin = begin + self.end = end + self.label = label + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Usage: make_json_align.py ') + parser.add_argument("--frequency", "-f", type=int, default=16000) + parser.add_argument("--frame_len", "-l", type=float, default=0.025) + parser.add_argument("--frame_shift", "-s", type=float, default=0.010) + parser.add_argument('json_path', type=str) + parser.add_argument('ali_wspec', type=str) + args = parser.parse_args() + + frame_len = int(round(args.frame_len * args.frequency)) + frame_shift = int(round(args.frame_shift * args.frequency)) + + print('Options:') + print(' Sampling frequency: {}'.format(args.frequency)) + print(' Frame length in sec: {} ({})'.format(args.frame_len, frame_len)) + print(' Frame shift in sec: {} ({})'.format(args.frame_shift, frame_shift)) + print(' Path to the source JSON: {}'.format(args.json_path)) + print(' Alignment write specifier: {}'.format(args.ali_wspec)) + + json_path = Path(args.json_path) + assert os.path.isfile(args.json_path), 'File does not exist {}'.format(args.json_path) + + print('Loading file {}'.format(json_path)) + with open(str(json_path)) as stream: + data = json.load(stream) + stream.close() + print(' loaded {} segments'.format(len(data))) + + print('Building alignment') + duration = 0.0 + for reco in data: + duration = max(time_to_seconds(reco['end_time']), duration) + print(' session duration {:.2f} hrs'.format(duration / 60 / 60)) + total_frames = (int(duration * args.frequency) - frame_len) // 
frame_shift + 1 + print(' total number of frames {}'.format(total_frames)) + alignment = np.zeros(total_frames, dtype=np.int32) + for reco in data: + start_time = time_to_seconds(reco['start_time']) + end_time = time_to_seconds(reco['end_time']) + num_frames = (int((end_time - start_time) * args.frequency) - frame_len) // frame_shift + 1 + start_frame = int(start_time * args.frequency) // frame_shift + end_frame = min(start_frame + num_frames, total_frames) + alignment[start_frame: end_frame] += 1 + value = args.frame_shift * np.count_nonzero(alignment == 0) + print(' out of segments: {:.2f} hrs'.format(value / 60 / 60)) + value = args.frame_shift * np.count_nonzero(alignment == 1) + print(' single speaker: {:.2f} hrs'.format(value / 60 / 60)) + value = args.frame_shift * np.count_nonzero(alignment > 1) + print(' overlapped speech: {:.2f} hrs'.format(value / 60 / 60)) + + alignment = np.vectorize(lambda nspk: 0 if nspk > 1 else nspk)(alignment) + + print('Writing alignment to {}'.format(args.ali_wspec)) + with WriteHelper(args.ali_wspec) as writer: + writer(data[0]['session_id'], alignment) + writer.close() diff --git a/egs/chime6/s5b_track2/local/ts-vad/make_negative_utt2spk.pl b/egs/chime6/s5b_track2/local/ts-vad/make_negative_utt2spk.pl new file mode 100755 index 00000000000..a8128551d5c --- /dev/null +++ b/egs/chime6/s5b_track2/local/ts-vad/make_negative_utt2spk.pl @@ -0,0 +1,155 @@ +#!/usr/bin/perl +# Copyright 2020 Ivan Medennikov (STC-innovations Ltd) +# Apache 2.0. + +# This script creates 3 negative utt2spk files with speakers from the same session. 
+ +($filein,$fileout,$fileout2,$fileout3)=@ARGV; + +$Nspk=4; + +%id2time2utt={}; +%utt2spk={}; +%utt2P={}; +%sid2spk={}; + +open(fidin, "<$filein") or die "can't open $filein : $!"; +while ($line=) +{ + $line=~s/\s+$//; + @items=split(/\s+/,$line); + $utt=$items[0]; + $spk=$items[1]; + $utt2spk{$utt}=$spk; + if ($utt=~/P(\d+)/) + { + $P=$1; + } + else + { + print "skipping utt $utt\n"; + next; + } + $utt2P{$utt}=$P; + if ($utt=~/S(\d+)/) + { + $S=$1; + } + else + { + print "skipping utt $utt\n"; + next; + } + if ($utt=~/\D(\d{7})-\d{7}/) + { + $beg=$1; + } + else + { + print "skipping utt $utt\n"; + next; + } + $type=0; + if ($utt=~/rev/) + { + $type=1; + } + elsif ($utt=~/\.L/) + { + $type=L; + } + elsif ($utt=~/\.R/) + { + $type=R; + } + elsif ($utt=~/(U\d+).+(CH\d+)/) + { + $type="$1_$2"; + } + if ($utt=~/(sp0.9)/) + { + $type="$1_$type"; + } + if ($utt=~/(sp1.1)/) + { + $type="$1_$type"; + } + $id="S$S\_$type"; + if ( not exists $id2time2utt{$id} ) + { + %{$id2time2utt{$id}}={}; + } + push(@{$id2time2utt{$id}{$beg}},$utt); + $sid="$P\_$type"; + push(@{$sid2spk{$sid}},$spk); +} +close(fidin); + + +open(fidout, ">$fileout") or die "can't open $fileout : $!"; +open(fidout2, ">$fileout2") or die "can't open $fileout2 : $!"; +open(fidout3, ">$fileout3") or die "can't open $fileout3 : $!"; +foreach $id(sort keys %{id2time2utt}) +{ + $type=""; + if ($id=~/S\d+\_(\S+)/) + { + $type=$1; + } + @utts=(); + %curspk={}; + foreach $time(sort keys %{$id2time2utt{$id}}) + { + foreach $utt (@{$id2time2utt{$id}{$time}}) + { + $P=$utt2P{$utt}; + if ($utt=~/^\s*$/) { next; } + if (not exists $curspk{$P}) { $curspk{$P}=$utt2spk{$utt}; } + push(@utts,$utt); + } + } + foreach $utt (@utts) + { + $P=$utt2P{$utt}; + $curspk{$P}=$utt2spk{$utt}; + $Plast=int(($P-1)/$Nspk)*$Nspk+$Nspk; + $P1=$P+1; + if ($P1 > $Plast) + { + $P1=$P1-$Nspk; + } + if ($P1<10) + { + $P1="0$P1"; + } + $P2=$P+2; + if ($P2 > $Plast) + { + $P2=$P2-$Nspk; + } + if ($P2<10) + { + $P2="0$P2"; + } + $P3=$P+3; + if ($P3 
> $Plast) + { + $P3=$P3-$Nspk; + } + if ($P3<10) + { + $P3="0$P3"; + } + if ( not exists $curspk{$P1} ) { $sid=$P1."\_".$type; $cspk=$sid2spk{$sid}[rand @{$sid2spk{$sid}}]; print fidout "$utt $cspk\n"; } + else { print fidout "$utt $curspk{$P1}\n"; } + if ( not exists $curspk{$P2} ) { $sid=$P2."\_".$type; $cspk=$sid2spk{$sid}[rand @{$sid2spk{$sid}}]; print fidout2 "$utt $cspk\n"; } + else { print fidout2 "$utt $curspk{$P2}\n"; } + if ( not exists $curspk{$P3} ) { $sid=$P3."\_".$type; $cspk=$sid2spk{$sid}[rand @{$sid2spk{$sid}}]; print fidout3 "$utt $cspk\n"; } + else { print fidout3 "$utt $curspk{$P3}\n"; } + } +} + +close(fidout); +close(fidout2); +close(fidout3); +exit 0; \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/ts-vad/make_utt2uniq.pl b/egs/chime6/s5b_track2/local/ts-vad/make_utt2uniq.pl new file mode 100755 index 00000000000..94b55a6f598 --- /dev/null +++ b/egs/chime6/s5b_track2/local/ts-vad/make_utt2uniq.pl @@ -0,0 +1,51 @@ +#!/usr/bin/perl +# Copyright 2020 Ivan Medennikov (STC-innovations Ltd) +# Apache 2.0. + +# This script creates utt2uniq file for the CHiME-6 utterances. 
+ +($filein,$fileout)=@ARGV; + +open(fidout, ">$fileout") or die "can't open $fileout : $!"; +open(fidin, "<$filein") or die "can't open $filein : $!"; +while ($line=) +{ + $line=~s/\s+$//; + @items=split(/\s+/,$line); + $utt=$items[0]; + $spk=$items[1]; + if ($utt=~/P(\d+)/) + { + $P=$1; + } + else + { + print "skipping utt $utt\n"; + next; + } + if ($utt=~/S(\d+)/) + { + $S=$1; + } + else + { + print "skipping utt $utt\n"; + next; + } + if ($utt=~/\D(\d{7})-(\d{7})/) + { + $beg=$1; + $end=$2; + } + else + { + print "skipping utt $utt\n"; + next; + } + $id="P$P\_S$S\_$beg\-$end"; + print fidout "$utt $id\n"; +} +close(fidin); +close(fidout); + +exit 0; \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl b/egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl index 5c220332c9b..bc53308e021 100755 --- a/egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl +++ b/egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl @@ -2,6 +2,9 @@ # Copyright 2020 Ivan Medennikov (STC-innovations Ltd) # Apache 2.0. +# This script splits speakers in utt2spk file, +# leaving $ups utterances for each sub-speaker. + ($filein,$ups,$fileout)=@ARGV; open(fidin, "<$filein") or die "can't open $filein : $!"; diff --git a/egs/chime6/s5b_track2/local/ts-vad/prepare_json_weights.pl b/egs/chime6/s5b_track2/local/ts-vad/prepare_json_weights.pl new file mode 100755 index 00000000000..f58e28fcc96 --- /dev/null +++ b/egs/chime6/s5b_track2/local/ts-vad/prepare_json_weights.pl @@ -0,0 +1,57 @@ +#!/usr/bin/perl +# Copyright 2020 Ivan Medennikov (STC-innovations Ltd) +# Apache 2.0. + +# This script creates per-utterance json alignment by per-session alignment and segments. 
+ +($segments,$jsonali_scp,$jsonali_scp_perutt)=@ARGV; + +%ark={}; + +open(fidin, "<$jsonali_scp") or die "can't open $jsonali_scp : $!"; +while ($line=) +{ + $line=~s/\s+$//; + @items=split(/\s+/,$line); + $ark{$items[0]}=$items[1]; + print "$items[0] $ark{$items[0]}\n"; +} +close(fidin); + +open(fidin, "<$segments") or die "can't open $segments : $!"; +open(fidout, ">$jsonali_scp_perutt") or die "can't open $jsonali_scp_perutt : $!"; +while ($line=) +{ + $line=~s/\s+$//; + @items=split(/\s+/,$line); + $utt=$items[0]; + $wav=$items[1]; + $beg=$items[2]; + $end=$items[3]; + if ($utt=~/_(S\d+).*(\d{7})-(\d{7})/) + { + $sess=$1; + $ubeg=$2; + $ubeg=~s/^0+//; + if ($utt=~/sp(\d+\.\d+)/) + { + $sp=$1; + $ubeg=int($ubeg/$sp+0.5); + $sess=$sess."_sp$sp"; + } + if (($utt=~/^$wav\-\d+$/) || ($utt=~/^$wav$/)) + { + $beg=$ubeg+int($beg*100+0.5); + $end=$ubeg+int($end*100+0.5)-1; + } + else + { + $beg=int($beg*100+0.5); + $end=int($end*100+0.5)-1; + } + print fidout "$utt $ark{$sess}\[$beg\:$end\]\n"; + } +} +close(fidin); +close(fidout); +exit 0; \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/ts-vad/shuffle_4spk_scp_utt2spk.pl b/egs/chime6/s5b_track2/local/ts-vad/shuffle_4spk_scp_utt2spk.pl new file mode 100755 index 00000000000..6821609ab8e --- /dev/null +++ b/egs/chime6/s5b_track2/local/ts-vad/shuffle_4spk_scp_utt2spk.pl @@ -0,0 +1,151 @@ +#!/usr/bin/perl +# Copyright 2020 Ivan Medennikov (STC-innovations Ltd) +# Apache 2.0. + +# This script takes 4 scp files $filein{1,2,3,4} with the same utterance-ids, +# and produces 4 shuffled versions of them. +# Moreover, the same shuffling is performed with 4 utt2spk files $utt2spk{1,2,3,4}. 
+
+use List::Util qw(shuffle);
+
+# Arguments, in order: 4 input scp files, 4 output scp files,
+# 4 input utt2spk files, 4 output utt2spk files.
+($filein1,$filein2,$filein3,$filein4,$fileout1,$fileout2,$fileout3,$fileout4,$utt2spk1,$utt2spk2,$utt2spk3,$utt2spk4,$out1,$out2,$out3,$out4)=@ARGV;
+
+# Hashes are initialised with an empty list. "%h={}" would insert a
+# stringified hash reference as a bogus key, and %utt2arks is iterated below.
+# %utt2arks: utterance-id -> list of "speaker ark" strings, one per input scp.
+%utt2arks=();
+
+# %utt2spkN: utterance-id -> speaker, read from the N-th utt2spk file.
+%utt2spk1=();
+open(fidin, "<$utt2spk1") or die "can't open $utt2spk1 : $!";
+while ($line=<fidin>)
+{
+    $line=~s/\s+$//;
+    @items=split(/\s+/,$line);
+    $utt=$items[0];
+    $spk=$items[1];
+    $utt2spk1{$utt}=$spk;
+}
+close(fidin);
+
+%utt2spk2=();
+open(fidin, "<$utt2spk2") or die "can't open $utt2spk2 : $!";
+while ($line=<fidin>)
+{
+    $line=~s/\s+$//;
+    @items=split(/\s+/,$line);
+    $utt=$items[0];
+    $spk=$items[1];
+    $utt2spk2{$utt}=$spk;
+}
+close(fidin);
+
+%utt2spk3=();
+open(fidin, "<$utt2spk3") or die "can't open $utt2spk3 : $!";
+while ($line=<fidin>)
+{
+    $line=~s/\s+$//;
+    @items=split(/\s+/,$line);
+    $utt=$items[0];
+    $spk=$items[1];
+    $utt2spk3{$utt}=$spk;
+}
+close(fidin);
+
+%utt2spk4=();
+open(fidin, "<$utt2spk4") or die "can't open $utt2spk4 : $!";
+while ($line=<fidin>)
+{
+    $line=~s/\s+$//;
+    @items=split(/\s+/,$line);
+    $utt=$items[0];
+    $spk=$items[1];
+    $utt2spk4{$utt}=$spk;
+}
+close(fidin);
+
+
+# For every line of the N-th scp file, append "speaker ark" to the list of
+# its utterance. The split limit of 2 keeps the rest of the line as the ark
+# field, even if it contains whitespace.
+open(fidin, "<$filein1") or die "can't open $filein1 : $!";
+while ($line=<fidin>)
+{
+    $line=~s/\s+$//;
+    @items=split(/\s+/,$line, 2);
+    $utt=$items[0];
+    $ark=$items[1];
+    push(@{$utt2arks{$utt}},"$utt2spk1{$utt} $ark");
+}
+close(fidin);
+
+open(fidin, "<$filein2") or die "can't open $filein2 : $!";
+while ($line=<fidin>)
+{
+    $line=~s/\s+$//;
+    @items=split(/\s+/,$line, 2);
+    $utt=$items[0];
+    $ark=$items[1];
+    push(@{$utt2arks{$utt}},"$utt2spk2{$utt} $ark");
+}
+close(fidin);
+
+open(fidin, "<$filein3") or die "can't open $filein3 : $!";
+while ($line=<fidin>)
+{
+    $line=~s/\s+$//;
+    @items=split(/\s+/,$line, 2);
+    $utt=$items[0];
+    $ark=$items[1];
+    push(@{$utt2arks{$utt}},"$utt2spk3{$utt} $ark");
+}
+close(fidin);
+
+open(fidin, "<$filein4") or die "can't open $filein4 : $!";
+while ($line=<fidin>)
+{
+    $line=~s/\s+$//;
+    @items=split(/\s+/,$line, 2);
+    $utt=$items[0];
+    $ark=$items[1];
+    push(@{$utt2arks{$utt}},"$utt2spk4{$utt} $ark");
+}
+close(fidin);
+
+open(fidout1, ">$fileout1") or die "can't open $fileout1 : $!";
+open(fidout2, ">$fileout2") or die "can't open $fileout2 : $!";
+open(fidout3, ">$fileout3") or die "can't open $fileout3 : $!";
+open(fidout4, ">$fileout4") or die "can't open $fileout4 : $!";
+
+open(out1, ">$out1") or die "can't open $out1 : $!";
+open(out2, ">$out2") or die "can't open $out2 : $!";
+open(out3, ">$out3") or die "can't open $out3 : $!";
+open(out4, ">$out4") or die "can't open $out4 : $!";
+
+foreach $utt(sort(keys %utt2arks))
+{
+    # Utterances with fewer than 4 collected entries are dropped.
+    if (scalar(@{$utt2arks{$utt}}) < 4)
+    {
+        next;
+    }
+    # shuffle() is not seeded here, so the permutation differs between runs.
+    @shf = shuffle(@{$utt2arks{$utt}});
+    # Split each "speaker ark" string back into (speaker, ark).
+    @u1 = split(/\s+/, $shf[0], 2);
+    @u2 = split(/\s+/, $shf[1], 2);
+    @u3 = split(/\s+/, $shf[2], 2);
+    @u4 = split(/\s+/, $shf[3], 2);
+
+    # The same permutation is applied to the scp and the utt2spk outputs.
+    print fidout1 "$utt $u1[1]\n";
+    print fidout2 "$utt $u2[1]\n";
+    print fidout3 "$utt $u3[1]\n";
+    print fidout4 "$utt $u4[1]\n";
+
+    print out1 "$utt $u1[0]\n";
+    print out2 "$utt $u2[0]\n";
+    print out3 "$utt $u3[0]\n";
+    print out4 "$utt $u4[0]\n";
+
+}
+close(fidout1);
+close(fidout2);
+close(fidout3);
+close(fidout4);
+
+close(out1);
+close(out2);
+close(out3);
+close(out4);
+
+exit 0;
+
diff --git a/egs/chime6/s5b_track2/run.sh b/egs/chime6/s5b_track2/run.sh index 4bbd9c59d58..7a271a787e4 100755 --- a/egs/chime6/s5b_track2/run.sh +++ b/egs/chime6/s5b_track2/run.sh @@ -17,6 +17,7 @@ nnet_stage=-10 sad_stage=0 diarizer_stage=0 decode_stage=0 +ts_vad_stage=0 enhancement=beamformit # for a new enhancement method, # change this variable and decode stage decode_only=false @@ -25,12 +26,6 @@ snrs="20:10:15:5:0" foreground_snrs="20:10:15:5:0" background_snrs="20:10:15:5:0" -# pre-trained TS-VAD model -ts_vad_name=ts-vad_b.tar.gz -ts_vad_link=https://github.com/yuri-hohlov/ts-vad-data/raw/master/${ts_vad_name} -ts_vad_dir=exp/ts-vad_b -ivector_dir=exp/nnet3_b - # End configuration section . ./utils/parse_options.sh @@ -38,7 +33,7 @@ ivector_dir=exp/nnet3_b .
./path.sh if [ $decode_only == "true" ]; then - stage=18 + stage=19 fi set -e # exit on error @@ -57,6 +52,10 @@ train_set=train_worn_simu_u400k sad_train_set=train_worn_u400k test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb" +# TS-VAD options +ts_vad_dir=exp/ts-vad_1a +ivector_dir=exp/nnet3_${train_set}_cleaned_rvb + # This script also needs the phonetisaurus g2p, srilm, beamformit ./local/check_tools.sh || exit 1; @@ -291,16 +290,28 @@ if [ $stage -le 17 ]; then --model-dir exp/xvector_nnet_1a fi +########################################################################## +# TS-VAD MODEL TRAINING +# You can also download a pretrained diarization model using: +# ts_vad_name=ts-vad_1a.tar.gz +# ts_vad_link=https://github.com/yuri-hohlov/ts-vad-data/raw/master/${ts_vad_name} +# [ ! -f $ts_vad_name ] && wget -O $ts_vad_name $ts_vad_link +# [ ! -d $ts_vad_dir ] && tar -zxvf $ts_vad_name -C $(dirname $ts_vad_dir) +########################################################################## +if [ $stage -le 18 ]; then + local/train_ts-vad.sh --stage $ts_vad_stage \ + --nnet3-affix _${train_set}_cleaned_rvb \ + --basedata ${train_set}_cleaned_sp +fi + ########################################################################## # DECODING: In track 2, we are given raw utterances without segment # or speaker information, so we have to decode the whole pipeline, i.e., -# SAD -> Diarization (x-vectors + Spectral Clustering) -> TS-VAD Diarization -# GSS -> ASR. +# SAD -> Diarization (x-vectors + Spectral Clustering) -> +# 3 iterations of TS-VAD Diarization -> GSS -> ASR. # This is done in the local/decode_ts-vad.sh script. ########################################################################## -if [ $stage -le 18 ]; then - [ ! -f $ts_vad_name ] && wget -O $ts_vad_name $ts_vad_link - [ ! 
-d $ts_vad_dir ] && tar -zxvf $ts_vad_name -C $(dirname $ts_vad_dir) +if [ $stage -le 19 ]; then local/decode_ts-vad.sh --stage $decode_stage \ --ts-vad-dir $ts_vad_dir --ivector-dir $ivector_dir \ --enhancement $enhancement \ diff --git a/src/featbin/Makefile b/src/featbin/Makefile index 861ba3f7a93..614cc77d80d 100644 --- a/src/featbin/Makefile +++ b/src/featbin/Makefile @@ -17,7 +17,7 @@ BINFILES = add-deltas add-deltas-sdc append-post-to-feats \ process-kaldi-pitch-feats process-pitch-feats \ select-feats shift-feats splice-feats subsample-feats \ subset-feats transform-feats wav-copy wav-reverberate \ - wav-to-duration + wav-to-duration multiply-vectors paste-vectors OBJFILES = diff --git a/src/featbin/multiply-vectors.cc b/src/featbin/multiply-vectors.cc new file mode 100644 index 00000000000..70c9e6dcf63 --- /dev/null +++ b/src/featbin/multiply-vectors.cc @@ -0,0 +1,173 @@ +// featbin/multiply-vectors.cc + +// Copyright 2012 Korbinian Riedhammer +// 2013 Brno University of Technology (Author: Karel Vesely) +// 2013 Johns Hopkins University (Author: Daniel Povey) +// 2020 Ivan Medennikov (STC-innovations Ltd) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "matrix/kaldi-matrix.h" + +namespace kaldi { + +// returns true if successfully multiplied. 
+bool MultiplyVectors(const std::vector<Vector<BaseFloat> > &in,
+                     std::string utt,
+                     int32 tolerance,
+                     Vector<BaseFloat> *out) {
+  // Writes to 'out' the element-wise product of all vectors in 'in'.
+  // Every input is truncated to the shortest length. If the longest and the
+  // shortest lengths differ by more than 'tolerance', or the shortest input
+  // is empty, a warning is logged, 'out' is resized to 0 and false is
+  // returned. 'utt' is only used in log messages.
+  KALDI_ASSERT(!in.empty());
+  int32 min_len = in[0].Dim(),
+      max_len = in[0].Dim();
+  for (size_t i = 1; i < in.size(); i++) {
+    int32 len = in[i].Dim();
+    if (len < min_len) min_len = len;
+    if (len > max_len) max_len = len;
+  }
+  if (max_len - min_len > tolerance || min_len == 0) {
+    KALDI_WARN << "Length mismatch " << max_len << " vs. " << min_len
+               << (utt.empty() ? "" : " for utt ") << utt
+               << " exceeds tolerance " << tolerance;
+    out->Resize(0);
+    return false;
+  }
+  if (max_len - min_len > 0) {
+    KALDI_VLOG(2) << "Length mismatch " << max_len << " vs. " << min_len
+                  << (utt.empty() ? "" : " for utt ") << utt
+                  << " within tolerance " << tolerance;
+  }
+  // Start from a vector of ones and multiply every input into it.
+  out->Resize(min_len);
+  out->Set(1.0);
+  for (size_t i = 0; i < in.size(); i++) {
+    out->MulElements(in[i].Range(0, min_len));
+  }
+  return true;
+}
+
+
+}
+
+// Two modes, selected by whether the first argument is a table rspecifier:
+//  - tables: multiply the vectors that share a key across all inputs;
+//  - single files: multiply the vectors read from each rxfilename.
+// The last argument is always the output.
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace std;
+
+    const char *usage =
+        "Multiply vectors frame-by-frame (assuming they have about the same durations, see --length-tolerance);\n"
+        "Usage: multiply-vectors <in-rspecifier1> <in-rspecifier2> [<in-rspecifier3> ...] <out-wspecifier>\n"
+        " or: multiply-vectors <in-rxfilename1> <in-rxfilename2> [<in-rxfilename3> ...] <out-wxfilename>\n"
+        " e.g. multiply-vectors ark:vec1.ark ark:vec2.ark ark:out.ark\n"
+        " or: multiply-vectors foo.mat bar.mat baz.mat\n"
+        "See also: paste-feats, copy-vector, append-vector-to-feats\n";
+
+    ParseOptions po(usage);
+
+    int32 length_tolerance = 0;
+    bool binary = true;
+    po.Register("length-tolerance", &length_tolerance,
+                "If length is different, trim as shortest up to a frame "
+                " difference of length-tolerance, otherwise exclude segment.");
+    po.Register("binary", &binary, "If true, output files in binary "
+                "(only relevant for single-file operation, i.e. no tables)");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    if (ClassifyRspecifier(po.GetArg(1), NULL, NULL)
+        != kNoRspecifier) {
+      // We're operating on tables, e.g. archives.
+
+      // Last argument is output
+      string wspecifier = po.GetArg(po.NumArgs());
+      BaseFloatVectorWriter vector_writer(wspecifier);
+
+      // First input is sequential
+      string rspecifier1 = po.GetArg(1);
+      SequentialBaseFloatVectorReader input1(rspecifier1);
+
+      // Assemble vector of other input readers (with random-access).
+      // The readers are owned by this vector and deleted after the main loop.
+      vector<RandomAccessBaseFloatVectorReader*> input;
+      for (int32 i = 2; i < po.NumArgs(); i++) {
+        string rspecifier = po.GetArg(i);
+        RandomAccessBaseFloatVectorReader *rd = new RandomAccessBaseFloatVectorReader(rspecifier);
+        input.push_back(rd);
+      }
+
+      int32 num_done = 0, num_err = 0;
+
+      // Main loop
+      for (; !input1.Done(); input1.Next()) {
+        string utt = input1.Key();
+        KALDI_VLOG(2) << "Multiplying vectors for utterance " << utt;
+
+        // Collect the vectors for this key from all streams into 'vectors';
+        // slot 0 is the sequential input, slot i + 1 is input[i].
+        vector<Vector<BaseFloat> > vectors(po.NumArgs() - 1);
+        vectors[0] = input1.Value();
+        int32 i;
+        for (i = 0; i < static_cast<int32>(input.size()); i++) {
+          if (input[i]->HasKey(utt)) {
+            vectors[i + 1] = input[i]->Value(utt);
+          } else {
+            // input[i] was opened from argument number i + 2.
+            KALDI_WARN << "Missing utt " << utt << " from input "
+                       << po.GetArg(i+2);
+            num_err++;
+            break;
+          }
+        }
+        // The loop above broke out early: some input lacks this key.
+        if (i != static_cast<int32>(input.size()))
+          continue;
+        Vector<BaseFloat> output;
+        if (!MultiplyVectors(vectors, utt, length_tolerance, &output)) {
+          num_err++;
+          continue;  // it will have printed a warning.
+        }
+        vector_writer.Write(utt, output);
+        num_done++;
+      }
+
+      for (size_t i = 0; i < input.size(); i++)
+        delete input[i];
+      input.clear();
+
+      KALDI_LOG << "Done " << num_done << " utts, errors on "
+                << num_err;
+
+      // Non-zero exit status when nothing was written.
+      return (num_done == 0 ? -1 : 0);
+    } else {
+      // We're operating on rxfilenames|wxfilenames, most likely files.
+      std::vector<Vector<BaseFloat> > vectors(po.NumArgs() - 1);
+      for (int32 i = 1; i < po.NumArgs(); i++)
+        ReadKaldiObject(po.GetArg(i), &(vectors[i-1]));
+      Vector<BaseFloat> output;
+      if (!MultiplyVectors(vectors, "", length_tolerance, &output))
+        return 1;  // it will have printed a warning.
+      std::string output_wxfilename = po.GetArg(po.NumArgs());
+      WriteKaldiObject(output, output_wxfilename, binary);
+      KALDI_LOG << "Wrote multiplied vector to " << output_wxfilename;
+      return 0;
+    }
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/featbin/paste-vectors.cc b/src/featbin/paste-vectors.cc new file mode 100644 index 00000000000..06d373ba7e0 --- /dev/null +++ b/src/featbin/paste-vectors.cc @@ -0,0 +1,138 @@ +// featbin/paste-vectors.cc + +// Copyright 2012 Korbinian Riedhammer +// 2013 Brno University of Technology (Author: Karel Vesely) +// 2013 Johns Hopkins University (Author: Daniel Povey) +// 2020 Ivan Medennikov (STC-innovation Ltd) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "matrix/kaldi-matrix.h" + +namespace kaldi { + +// returns true if successfully appended.
+bool AppendVectors(const std::vector<Vector<BaseFloat> > &in,
+                   std::string utt,
+                   Vector<BaseFloat> *out) {
+  // Concatenates the vectors in 'in', in order, into 'out'. The dimension of
+  // 'out' is the sum of the input dimensions. No length check is performed
+  // and this function always returns true; 'utt' is currently unused.
+  int32 tot_dim = 0;
+  for (size_t i = 0; i < in.size(); i++)
+    tot_dim += in[i].Dim();
+  out->Resize(tot_dim);
+  // Copy each input into its own sub-range of 'out'.
+  int32 dim_offset = 0;
+  for (size_t i = 0; i < in.size(); i++) {
+    int32 this_dim = in[i].Dim();
+    out->Range(dim_offset, this_dim).CopyFromVec(
+        in[i].Range(0, this_dim));
+    dim_offset += this_dim;
+  }
+  return true;
+}
+
+
+}
+
+// Concatenates, for every key of the first (sequential) input table, the
+// vectors stored under that key in all input tables, and writes the result
+// to the output table given as the last argument.
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace std;
+
+    const char *usage =
+        "Paste vector files \n"
+        "Usage: paste-vectors <in-rspecifier1> <in-rspecifier2> [<in-rspecifier3> ...] <out-wspecifier>\n"
+        "See also: paste-feats, copy-feats, copy-matrix, append-vector-to-feats, concat-feats\n";
+
+    ParseOptions po(usage);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    if (ClassifyRspecifier(po.GetArg(1), NULL, NULL)
+        != kNoRspecifier) {
+      // We're operating on tables, e.g. archives.
+
+      // Last argument is output
+      string wspecifier = po.GetArg(po.NumArgs());
+      BaseFloatVectorWriter vec_writer(wspecifier);
+
+      // First input is sequential
+      string rspecifier1 = po.GetArg(1);
+      SequentialBaseFloatVectorReader input1(rspecifier1);
+
+      // Assemble vector of other input readers (with random-access).
+      // The readers are owned by this vector and deleted after the main loop.
+      vector<RandomAccessBaseFloatVectorReader*> input;
+      for (int32 i = 2; i < po.NumArgs(); i++) {
+        string rspecifier = po.GetArg(i);
+        RandomAccessBaseFloatVectorReader *rd = new RandomAccessBaseFloatVectorReader(rspecifier);
+        input.push_back(rd);
+      }
+
+      int32 num_done = 0, num_err = 0;
+
+      // Main loop
+      for (; !input1.Done(); input1.Next()) {
+        string utt = input1.Key();
+        KALDI_VLOG(2) << "Merging vectors for utterance " << utt;
+
+        // Collect the vectors for this key from all streams into 'vectors';
+        // slot 0 is the sequential input, slot i + 1 is input[i].
+        vector<Vector<BaseFloat> > vectors(po.NumArgs() - 1);
+        vectors[0] = input1.Value();
+        int32 i;
+        for (i = 0; i < static_cast<int32>(input.size()); i++) {
+          if (input[i]->HasKey(utt)) {
+            vectors[i + 1] = input[i]->Value(utt);
+          } else {
+            // input[i] was opened from argument number i + 2.
+            KALDI_WARN << "Missing utt " << utt << " from input "
+                       << po.GetArg(i+2);
+            num_err++;
+            break;
+          }
+        }
+        // The loop above broke out early: some input lacks this key.
+        if (i != static_cast<int32>(input.size()))
+          continue;
+        Vector<BaseFloat> output;
+        if (!AppendVectors(vectors, utt, &output)) {
+          num_err++;
+          continue;  // it will have printed a warning.
+        }
+        vec_writer.Write(utt, output);
+        num_done++;
+      }
+
+      for (size_t i = 0; i < input.size(); i++)
+        delete input[i];
+      input.clear();
+
+      KALDI_LOG << "Done " << num_done << " utts, errors on "
+                << num_err;
+
+      // Non-zero exit status when nothing was written.
+      return (num_done == 0 ? -1 : 0);
+    } else {
+      // Only table inputs are supported (see usage). Fail loudly rather than
+      // falling off the end of main() and exiting with status 0 after doing
+      // nothing.
+      KALDI_ERR << "paste-vectors expects table rspecifiers as inputs, got "
+                << po.GetArg(1);
+      return -1;  // not reached if KALDI_ERR throws.
+    }
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}