Merge pull request kaldi-asr#27 from chimechallenge/ts-vad

TS-VAD diarization
desh2608 · Jul 13, 2020 · ff8edb9 · ff8edb9
2 parents cc990a9 + fe4740e
commit ff8edb9
Show file tree

Hide file tree

Showing 91 changed files with 7,004 additions and 1 deletion.
diff --git a/egs/callhome_diarization/v1/diarization/calc_cossim_scores.py b/egs/callhome_diarization/v1/diarization/calc_cossim_scores.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+
+# Copyright  2020  Maxim Korenevsky (STC-innovations Ltd)
+# Apache 2.0.
+
+import argparse
+import numpy as np
+from scipy.spatial.distance import cosine, pdist, squareform
+from kaldiio import ReadHelper, WriteHelper
+
+
+def LoadReco2Utt(file):
+    if ':' in file:
+        file = file.split(':')[1]
+    IDs=dict()
+    with open(file,'r') as f:
+        for line in f:
+            ids = line.strip().split()
+            IDs[ids[0]] = ids[1:]
+    return IDs
+
+def ReadXvecs(rspec):
+    xvecs=dict()
+    with ReadHelper(rspec) as reader:
+        for utid, xvec in reader:
+            xvecs[utid] = xvec
+    reader.close()
+    return xvecs
+
+def Normalize(xvecs_in):
+    N = len(xvecs_in)
+    xvec_mean=np.zeros(xvecs_in[0].shape)
+    for i in range(N):
+        xvec_mean += xvecs_in[i]
+    xvec_mean /= N
+    xvecs = np.copy(xvecs_in)
+    for i in range(N):
+        xvecs[i] -= xvec_mean
+        xvecs[i] = xvecs[i] / np.linalg.norm(xvecs[i])
+    return xvecs
+
+def CalcCosSim(vecs):
+    return 1 - squareform(pdist(np.asarray(vecs), 'cosine'))
+
+def WriteDistMatrices(D, wspec):
+    with WriteHelper(wspec) as writer:
+        for id in D:
+            writer(id, D[id])
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Usage: calc_cossim_scores.py <reco2utt-rspec> <xvec-rspec> <simmat-wspec>\nComputes matrices of the cosine similarity scores between normalized x-vectors for each recording')
+    parser.add_argument('reco2utt', type=str, help='Kaldi-style rspecifier of recording to segments correspondence')
+    parser.add_argument('xvec_rspec', type=str, help='Kaldi-style rspecifier of segment xvectors to read')
+    parser.add_argument('simmat_wspec', type=str, help='Kaldi-style wspecifier of similarity matrices to write')
+    args = parser.parse_args()
+
+
+    print('Computing cosine similarity matrix between ivectors')
+    print('Parameters:')
+    print('Reco2Utt rspecifier: {}'.format(args.reco2utt))
+    print('Xvectors rspecifier: {}'.format(args.xvec_rspec))
+    print('Similarity matrices wspecifier: {}'.format(args.simmat_wspec))
+
+    IDs = LoadReco2Utt(args.reco2utt)
+    xvecs_all = ReadXvecs(args.xvec_rspec)
+    D = dict()
+    for reco_id in IDs:
+        xvecs = [ xvecs_all[id] for id in IDs[reco_id] ]
+        xvecs = Normalize(xvecs)                              # !!!! Normalize per recording (session) !!!!
+        D[reco_id] = CalcCosSim(xvecs)
+    WriteDistMatrices(D, args.simmat_wspec)
diff --git a/egs/callhome_diarization/v1/diarization/scluster.sh b/egs/callhome_diarization/v1/diarization/scluster.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+
+# Copyright       2016  David Snyder
+#            2017-2018  Matthew Maciejewski
+#                 2020  Maxim Korenevsky (STC-innovations Ltd)
+# Apache 2.0.
+
+# This script performs spectral clustering using scored
+# pairs of subsegments and produces a rttm file with speaker
+# labels derived from the clusters.
+
+# Begin configuration section.
+cmd="run.pl"
+stage=0
+nj=10
+cleanup=true
+rttm_channel=0
+reco2num_spk=
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 2 ]; then
+  echo "Usage: $0 <src-dir> <dir>"
+  echo " e.g.: $0 exp/ivectors_callhome exp/ivectors_callhome/results"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --nj <n|10>                                      # Number of jobs (also see num-processes and num-threads)"
+  echo "  --stage <stage|0>                                # To control partial reruns"
+  echo "  --rttm-channel <rttm-channel|0>                  # The value passed into the RTTM channel field. Only affects"
+  echo "                                                   # the format of the RTTM file."
+  echo "  --reco2num-spk <reco2num-spk-file>               # File containing mapping of recording ID"
+  echo "                                                   # to number of speakers. Used instead of threshold"
+  echo "                                                   # as stopping criterion if supplied."
+  echo "  --cleanup <bool|false>                           # If true, remove temporary files"
+  exit 1;
+fi
+
+srcdir=$1
+dir=$2
+
+# Forward the optional recording -> number-of-speakers map to the
+# clustering script.  The variable is quoted so that an empty value or a
+# path containing spaces cannot break the test.
+reco2num_spk_opts=
+if [ -n "$reco2num_spk" ]; then
+  reco2num_spk_opts="--reco2num-spk $reco2num_spk"
+fi
+
+mkdir -p $dir/tmp
+
+for f in $srcdir/scores.scp $srcdir/spk2utt $srcdir/utt2spk $srcdir/segments ; do
+  [ ! -f $f ] && echo "No such file $f" && exit 1;
+done
+
+# Build a temporary data directory that is only used to split the
+# recordings across the $nj jobs.
+cp $srcdir/spk2utt $dir/tmp/
+cp $srcdir/utt2spk $dir/tmp/
+cp $srcdir/segments $dir/tmp/
+utils/fix_data_dir.sh $dir/tmp > /dev/null
+
+sdata=$dir/tmp/split$nj;
+utils/split_data.sh $dir/tmp $nj || exit 1;
+
+# Set various variables.
+mkdir -p $dir/log
+
+feats="utils/filter_scp.pl $sdata/JOB/spk2utt $srcdir/scores.scp |"
+if [ $stage -le 0 ]; then
+  echo "$0: clustering scores"
+  for j in `seq $nj`; do 
+    utils/filter_scp.pl $sdata/$j/spk2utt $srcdir/scores.scp > $dir/scores.$j.scp
+  done
+  $cmd JOB=1:$nj $dir/log/spectral_cluster.JOB.log \
+    python diarization/spec_clust.py $reco2num_spk_opts \
+      scp:$dir/scores.JOB.scp ark,t:$sdata/JOB/spk2utt ark,t:$dir/labels.JOB || exit 1;
+fi
+
+if [ $stage -le 1 ]; then
+  echo "$0: combining labels"
+  for j in $(seq $nj); do cat $dir/labels.$j; done > $dir/labels || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: computing RTTM"
+  diarization/make_rttm.py --rttm-channel $rttm_channel $srcdir/segments $dir/labels $dir/rttm || exit 1;
+fi
+
+if $cleanup ; then
+  rm -r $dir/tmp || exit 1;
+fi
diff --git a/egs/callhome_diarization/v1/diarization/score_cossim.sh b/egs/callhome_diarization/v1/diarization/score_cossim.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# Copyright  2016-2018  David Snyder
+#            2017-2018  Matthew Maciejewski
+#                 2020  Maxim Korenevsky (STC-innovations Ltd)
+# Apache 2.0.
+
+# This script is a modified version of diarization/score_plda.sh
+# that uses x-vectors in place of i-vectors and cosine similarity
+# in place of PLDA scoring.
+#
+# This script computes cosine similarity scores from pairs of normalized x-vectors extracted
+# from segments of a recording.  These scores are in the form of
+# affinity matrices, one for each recording.  Most likely, the x-vectors
+# were computed using diarization/nnet3/xvector/extract_xvectors.sh.
+# The affinity matrices are most likely going to be clustered using
+# diarization/scluster.sh.
+
+# Begin configuration section.
+cmd="run.pl"
+stage=0
+target_energy=0.1
+nj=10
+cleanup=true
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 2 ]; then
+  echo "Usage: $0 <xvector-dir> <output-dir>"
+  echo " e.g.: $0 exp/xvectors_callhome_heldout exp/xvectors_callhome_test exp/xvectors_callhome_test"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --nj <n|10>                                      # Number of jobs (also see num-processes and num-threads)"
+  echo "  --stage <stage|0>                                # To control partial reruns"
+  echo "  --cleanup <bool|false>                           # If true, remove temporary files"
+  exit 1;
+fi
+
+xvecdir=$1
+dir=$2
+
+mkdir -p $dir/tmp
+
+# All of these inputs must be present in the x-vector directory.
+for f in xvector.scp spk2utt utt2spk segments; do
+  if [ ! -f $xvecdir/$f ]; then
+    echo "No such file $xvecdir/$f"
+    exit 1;
+  fi
+done
+
+# The x-vectors play the role of features in the temporary data directory;
+# the remaining files are copied both there and into the output directory.
+cp $xvecdir/xvector.scp $dir/tmp/feats.scp
+for f in spk2utt utt2spk segments; do
+  cp $xvecdir/$f $dir/tmp/
+done
+for f in spk2utt utt2spk segments; do
+  cp $xvecdir/$f $dir/
+done
+
+utils/fix_data_dir.sh $dir/tmp > /dev/null
+
+# Split the temporary data directory into one piece per job.
+sdata=$dir/tmp/split$nj;
+utils/split_data.sh $dir/tmp $nj || exit 1;
+
+# Set various variables.
+mkdir -p $dir/log
+
+# Per-job input x-vectors; JOB is substituted by $cmd.
+feats="scp:$sdata/JOB/feats.scp"
+
+# Stage 0: compute one cosine-similarity matrix per recording in each job.
+if [ $stage -le 0 ]; then
+  echo "$0: scoring xvectors"
+  $cmd JOB=1:$nj $dir/log/cossim_scoring.JOB.log \
+      python diarization/calc_cossim_scores.py \
+      ark:$sdata/JOB/spk2utt "$feats" ark,scp:$dir/scores.JOB.ark,$dir/scores.JOB.scp || exit 1;
+fi
+
+# Stage 1: merge the per-job score lists, in job order, into scores.scp.
+if [ $stage -le 1 ]; then
+  echo "$0: combining cosine similarity scores across jobs"
+  for job in $(seq $nj); do
+    cat $dir/scores.$job.scp
+  done >$dir/scores.scp || exit 1;
+fi
+
+# Optionally drop the temporary split data directory.
+if $cleanup ; then
+  rm -rf $dir/tmp || exit 1;
+fi