diff --git a/egs/callhome_diarization/v1/diarization/calc_cossim_scores.py b/egs/callhome_diarization/v1/diarization/calc_cossim_scores.py
new file mode 100644
index 00000000000..98ae683f606
--- /dev/null
+++ b/egs/callhome_diarization/v1/diarization/calc_cossim_scores.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Maxim Korenevsky (STC-innovations Ltd)
+# Apache 2.0.
+
+import argparse
+import numpy as np
+from scipy.spatial.distance import pdist, squareform
+from kaldiio import ReadHelper, WriteHelper
+
+
+def LoadReco2Utt(file):
+    if ':' in file:
+        file = file.split(':')[1]
+    IDs = dict()
+    with open(file, 'r') as f:
+        for line in f:
+            ids = line.strip().split()
+            IDs[ids[0]] = ids[1:]
+    return IDs
+
+def ReadXvecs(rspec):
+    xvecs = dict()
+    with ReadHelper(rspec) as reader:
+        for utid, xvec in reader:
+            xvecs[utid] = xvec
+    return xvecs
+
+# Subtracts the per-recording mean and length-normalizes each x-vector
+def Normalize(xvecs_in):
+    N = len(xvecs_in)
+    xvec_mean = np.zeros(xvecs_in[0].shape)
+    for i in range(N):
+        xvec_mean += xvecs_in[i]
+    xvec_mean /= N
+    xvecs = np.copy(xvecs_in)
+    for i in range(N):
+        xvecs[i] -= xvec_mean
+        xvecs[i] = xvecs[i] / np.linalg.norm(xvecs[i])
+    return xvecs
+
+def CalcCosSim(vecs):
+    return 1 - squareform(pdist(np.asarray(vecs), 'cosine'))
+
+def WriteDistMatrices(D, wspec):
+    with WriteHelper(wspec) as writer:
+        for id in D:
+            writer(id, D[id])
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Usage: calc_cossim_scores.py <reco2utt-rspecifier> <xvec-rspecifier> <simmat-wspecifier>\nComputes matrices of the cosine similarity scores between normalized x-vectors for each recording')
+    parser.add_argument('reco2utt', type=str, help='Kaldi-style rspecifier of recording to segments correspondence')
+    parser.add_argument('xvec_rspec', type=str, help='Kaldi-style rspecifier of segment xvectors to read')
+    parser.add_argument('simmat_wspec', type=str, help='Kaldi-style wspecifier of similarity matrices to write')
+    args = parser.parse_args()
+
+    print('Computing cosine similarity matrices between x-vectors')
+    print('Parameters:')
+    print('Reco2Utt rspecifier: {}'.format(args.reco2utt))
+    print('Xvectors rspecifier: {}'.format(args.xvec_rspec))
+    print('Similarity matrices wspecifier: {}'.format(args.simmat_wspec))
+
+    IDs = LoadReco2Utt(args.reco2utt)
+    xvecs_all = ReadXvecs(args.xvec_rspec)
+    D = dict()
+    for reco_id in IDs:
+        xvecs = [ xvecs_all[id] for id in IDs[reco_id] ]
+        xvecs = Normalize(xvecs)   # !!!! Normalize per recording (session) !!!!
+        D[reco_id] = CalcCosSim(xvecs)
+    WriteDistMatrices(D, args.simmat_wspec)
diff --git a/egs/callhome_diarization/v1/diarization/scluster.sh b/egs/callhome_diarization/v1/diarization/scluster.sh
new file mode 100755
index 00000000000..51300315149
--- /dev/null
+++ b/egs/callhome_diarization/v1/diarization/scluster.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+
+# Copyright      2016  David Snyder
+#           2017-2018  Matthew Maciejewski
+#                2020  Maxim Korenevsky (STC-innovations Ltd)
+# Apache 2.0.
+
+# This script performs spectral clustering using scored
+# pairs of subsegments and produces an RTTM file with speaker
+# labels derived from the clusters.
+
+# Begin configuration section.
+cmd="run.pl"
+stage=0
+nj=10
+cleanup=true
+rttm_channel=0
+reco2num_spk=
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
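+
+# <src-dir> is expected to contain scores.scp as produced by
+# diarization/score_cossim.sh (one affinity matrix per recording),
+# along with spk2utt, utt2spk and segments.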
+
+if [ $# != 2 ]; then
+  echo "Usage: $0 <src-dir> <dir>"
+  echo " e.g.: $0 exp/ivectors_callhome exp/ivectors_callhome/results"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --nj <n|10>                                      # Number of jobs (also see num-processes and num-threads)"
+  echo "  --stage <stage|0>                                # To control partial reruns"
+  echo "  --rttm-channel <rttm-channel|0>                  # The value passed into the RTTM channel field. Only affects"
+  echo "                                                   # the format of the RTTM file."
+  echo "  --reco2num-spk <reco2num-spk-file>               # File containing mapping of recording ID"
+  echo "                                                   # to number of speakers. Used instead of threshold"
+  echo "                                                   # as stopping criterion if supplied."
+  echo "  --cleanup <bool|true>                            # If true, remove temporary files"
+  exit 1;
+fi
+
+srcdir=$1
+dir=$2
+
+reco2num_spk_opts=
+if [ ! -z "$reco2num_spk" ]; then
+  reco2num_spk_opts="--reco2num-spk $reco2num_spk"
+fi
+
+mkdir -p $dir/tmp
+
+for f in $srcdir/scores.scp $srcdir/spk2utt $srcdir/utt2spk $srcdir/segments ; do
+  [ ! -f $f ] && echo "No such file $f" && exit 1;
+done
+
+cp $srcdir/spk2utt $dir/tmp/
+cp $srcdir/utt2spk $dir/tmp/
+cp $srcdir/segments $dir/tmp/
+utils/fix_data_dir.sh $dir/tmp > /dev/null
+
+sdata=$dir/tmp/split$nj;
+utils/split_data.sh $dir/tmp $nj || exit 1;
+
+# Set various variables.
+mkdir -p $dir/log
+
+if [ $stage -le 0 ]; then
+  echo "$0: clustering scores"
+  for j in `seq $nj`; do
+    utils/filter_scp.pl $sdata/$j/spk2utt $srcdir/scores.scp > $dir/scores.$j.scp
+  done
+  $cmd JOB=1:$nj $dir/log/spectral_cluster.JOB.log \
+    python diarization/spec_clust.py $reco2num_spk_opts \
+      scp:$dir/scores.JOB.scp ark,t:$sdata/JOB/spk2utt ark,t:$dir/labels.JOB || exit 1;
+fi
+
+if [ $stage -le 1 ]; then
+  echo "$0: combining labels"
+  for j in $(seq $nj); do cat $dir/labels.$j; done > $dir/labels || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: computing RTTM"
+  diarization/make_rttm.py --rttm-channel $rttm_channel $srcdir/segments $dir/labels $dir/rttm || exit 1;
+fi
+
+if $cleanup ; then
+  rm -r $dir/tmp || exit 1;
+fi
diff --git a/egs/callhome_diarization/v1/diarization/score_cossim.sh b/egs/callhome_diarization/v1/diarization/score_cossim.sh
new file mode 100755
index 00000000000..6b37d6a5d1a
--- /dev/null
+++ b/egs/callhome_diarization/v1/diarization/score_cossim.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# Copyright 2016-2018  David Snyder
+#           2017-2018  Matthew Maciejewski
+#                2020  Maxim Korenevsky (STC-innovations Ltd)
+# Apache 2.0.
+
+# This script is a modified version of diarization/score_plda.sh
+# that replaces PLDA scoring with cosine similarity scoring of
+# normalized x-vectors.
+#
+# This script computes cosine similarity scores from pairs of normalized x-vectors extracted
+# from segments of a recording. These scores are in the form of
+# affinity matrices, one for each recording. Most likely, the x-vectors
+# were computed using diarization/nnet3/xvector/extract_xvectors.sh.
+# The affinity matrices are most likely going to be clustered using
+# diarization/scluster.sh.
+
+# Begin configuration section.
+cmd="run.pl"
+stage=0
+nj=10
+cleanup=true
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
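+
+# <xvector-dir> must contain xvector.scp (typically produced by
+# diarization/nnet3/xvector/extract_xvectors.sh) together with spk2utt,
+# utt2spk and segments; the per-recording affinity matrices are written
+# to <output-dir>/scores.scp.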
+
+if [ $# != 2 ]; then
+  echo "Usage: $0 <xvector-dir> <output-dir>"
+  echo " e.g.: $0 exp/xvectors_callhome_test exp/xvectors_callhome_test"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --nj <n|10>                                      # Number of jobs (also see num-processes and num-threads)"
+  echo "  --stage <stage|0>                                # To control partial reruns"
+  echo "  --cleanup <bool|true>                            # If true, remove temporary files"
+  exit 1;
+fi
+
+xvecdir=$1
+dir=$2
+
+mkdir -p $dir/tmp
+
+for f in $xvecdir/xvector.scp $xvecdir/spk2utt $xvecdir/utt2spk $xvecdir/segments; do
+  [ ! -f $f ] && echo "No such file $f" && exit 1;
+done
+cp $xvecdir/xvector.scp $dir/tmp/feats.scp
+cp $xvecdir/spk2utt $dir/tmp/
+cp $xvecdir/utt2spk $dir/tmp/
+cp $xvecdir/segments $dir/tmp/
+cp $xvecdir/spk2utt $dir/
+cp $xvecdir/utt2spk $dir/
+cp $xvecdir/segments $dir/
+
+utils/fix_data_dir.sh $dir/tmp > /dev/null
+
+sdata=$dir/tmp/split$nj;
+utils/split_data.sh $dir/tmp $nj || exit 1;
+
+# Set various variables.
+mkdir -p $dir/log
+
+feats="scp:$sdata/JOB/feats.scp"
+if [ $stage -le 0 ]; then
+  echo "$0: scoring xvectors"
+  $cmd JOB=1:$nj $dir/log/cossim_scoring.JOB.log \
+    python diarization/calc_cossim_scores.py \
+      ark:$sdata/JOB/spk2utt "$feats" ark,scp:$dir/scores.JOB.ark,$dir/scores.JOB.scp || exit 1;
+fi
+
+if [ $stage -le 1 ]; then
+  echo "$0: combining cosine similarity scores across jobs"
+  for j in $(seq $nj); do cat $dir/scores.$j.scp; done > $dir/scores.scp || exit 1;
+fi
+
+if $cleanup ; then
+  rm -rf $dir/tmp || exit 1;
+fi
diff --git a/egs/callhome_diarization/v1/diarization/spec_clust.py b/egs/callhome_diarization/v1/diarization/spec_clust.py
new file mode 100644
index 00000000000..2bf771c6132
--- /dev/null
+++ b/egs/callhome_diarization/v1/diarization/spec_clust.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Maxim Korenevsky (STC-innovations Ltd)
+# Apache 2.0.
+
+import argparse
+import numpy as np
+import scipy.sparse.linalg
+from sklearn.cluster import SpectralClustering
+from kaldiio import ReadHelper, WriteHelper
+
+'''
+    Spectral Clustering based on binarization and automatic thresholding
+    Paper: T. Park, K. Han, M. Kumar, and S. Narayanan, "Auto-tuning spectral clustering
+    for speaker diarization using normalized maximum eigengap",
+    IEEE Signal Processing Letters, vol. 27, pp. 381-385, 2019
+'''
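+
+# Sketch of the NME auto-tuning implemented below (using the helper
+# functions defined in this file):
+#   for p in 2..pmax:
+#     Ap = binarize A, keeping the p largest entries per row  (get_kneighbors_conn)
+#     Lp = Laplacian of the symmetrized Ap                    (Laplacian)
+#     g  = largest eigengap of Lp / largest eigenvalue of Lp  (ComputeNMEParameters)
+#     r  = p / g          # NME ratio; smaller is better
+#   choose the p with the smallest r; the index of the maximal eigengap
+#   (plus one) gives the estimated number of speakers.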
+
+# Input-output routines
+
+def LoadAffinityMatrix(file):
+    Matrices = dict()
+    with ReadHelper(file) as reader:
+        for key, np_arr in reader:
+            Matrices[key] = np_arr
+    return Matrices
+
+def LoadReco2Utt(file):
+    if ':' in file:
+        file = file.split(':')[1]
+    IDs = dict()
+    with open(file, 'r') as f:
+        for line in f:
+            ids = line.strip().split()
+            IDs[ids[0]] = ids[1:]
+    return IDs
+
+
+def LoadReco2NumSpk(file):
+    if ':' in file:
+        file = file.split(':')[1]
+    NumSpk = dict()
+    with open(file, 'r') as f:
+        for line in f:
+            ids = line.strip().split()
+            NumSpk[ids[0]] = int(ids[1])
+    return NumSpk
+
+def SaveLabels(IDs, labels, file):
+    if ':' in file:
+        file = file.split(':')[1]
+    with open(file, 'w') as f:
+        for id in IDs:
+            for i in range(len(IDs[id])):
+                f.write('{} {}\n'.format(IDs[id][i], labels[id][i]+1))
+
+# NME low-level operations
+
+# Prepares binarized (0/1) affinity matrix with p_neighbors non-zero elements in each row
+def get_kneighbors_conn(X_dist, p_neighbors):
+    X_dist_out = np.zeros_like(X_dist)
+    for i, line in enumerate(X_dist):
+        sorted_idx = np.argsort(line)
+        sorted_idx = sorted_idx[::-1]
+        indices = sorted_idx[:p_neighbors]
+        X_dist_out[indices, i] = 1
+    return X_dist_out
+
+# Thresholds the affinity matrix, keeping the p largest non-zero elements in each row
+def Threshold(A, p):
+    N = A.shape[0]
+    Ap = np.zeros((N, N))
+    for i in range(N):
+        thr = sorted(A[i, :], reverse=True)[p]
+        Ap[i, A[i, :] > thr] = A[i, A[i, :] > thr]
+    return Ap
+
+# Computes the (unnormalized) Laplacian of a matrix
+def Laplacian(A):
+    d = np.sum(A, axis=1) - np.diag(A)
+    D = np.diag(d)
+    return D - A
+
+# Calculates eigengaps (differences between adjacent eigenvalues sorted in ascending order)
+def Eigengap(S):
+    S = sorted(S)
+    return np.diff(S)
+
+# Computes parameters of normalized eigenmaps for automatic thresholding selection
+def ComputeNMEParameters(A, p, max_num_clusters):
+    # p-neighbour binarization
+    Ap = get_kneighbors_conn(A, p)
+    # Symmetrization
+    Ap = (Ap + np.transpose(Ap)) / 2
+    # Laplacian matrix computation
+    Lp = Laplacian(Ap)
+    N = Lp.shape[0]
+    # Eigenvalue decomposition:
+    # get the max_num_clusters+1 lowest eigenvalues sorted in ascending order
+    S, _ = scipy.sparse.linalg.eigsh(-Lp, k=max_num_clusters+1, which='LA')
+    S = -S[::-1]
+    # Get the largest eigenvalue
+    Smax, _ = scipy.sparse.linalg.eigsh(Lp, k=1, which='LA')
+    # Eigengap computation
+    e = Eigengap(S)
+#    g = np.max(e[:max_num_clusters])/(np.max(S)+1e-10)
+    g = np.max(e[:max_num_clusters]) / (Smax[0] + 1e-10)
+    r = p / g
+    k = np.argmax(e[:max_num_clusters])
+    return (e, g, k, r)
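+
+# Worked micro-example of the eigengap heuristic (hypothetical values):
+# if the Laplacian eigenvalues are [0.0, 0.1, 0.2, 2.0, 2.1], the gaps are
+# [0.1, 0.1, 1.8, 0.1]; the largest gap is at index 2 (0-based), so the
+# estimated number of clusters is k+1 = 3.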
+
+'''
+Performs spectral clustering with Normalized Maximum Eigengap (NME)
+Parameters:
+   A: affinity matrix (matrix of pairwise cosine similarities or PLDA scores between speaker embeddings)
+   num_clusters: number of clusters to generate (if None, determined automatically)
+   max_num_clusters: maximum allowed number of clusters to generate
+   pmax: maximum count for matrix binarization (should be at least 2)
+   pbest: best count for matrix binarization (if 0, determined automatically)
+Returns: cluster assignments for every speaker embedding
+'''
+def NME_SpectralClustering(A, num_clusters = None, max_num_clusters = 10, pbest = 0, pmax = 20):
+    if pbest == 0:
+        print('Selecting best number of neighbors for affinity matrix thresholding:')
+        rbest = None
+        kbest = None
+        for p in range(2, pmax+1):
+            e, g, k, r = ComputeNMEParameters(A, p, max_num_clusters)
+            print('p={}, r={}'.format(p, r))
+            if rbest is None or rbest > r:
+                rbest = r
+                pbest = p
+                kbest = k
+        print('Best number of neighbors is {}'.format(pbest))
+        return NME_SpectralClustering_sklearn(A, num_clusters if num_clusters is not None else (kbest+1), pbest)
+    if num_clusters is None:
+        print('Computing number of clusters to generate:')
+        e, g, k, r = ComputeNMEParameters(A, pbest, max_num_clusters)
+        print('Number of clusters to generate is {}'.format(k+1))
+        return NME_SpectralClustering_sklearn(A, k+1, pbest)
+    return NME_SpectralClustering_sklearn(A, num_clusters, pbest)
+
+'''
+Performs spectral clustering with Normalized Maximum Eigengap (NME) with fixed threshold and number of clusters
+Parameters:
+   A: affinity matrix (matrix of pairwise cosine similarities or PLDA scores between speaker embeddings)
+   num_clusters: number of clusters to generate
+   pbest: best count for matrix binarization
+Returns: cluster assignments for every speaker embedding
+'''
+def NME_SpectralClustering_sklearn(A, num_clusters, pbest):
+    Ap = Threshold(A, pbest)
+    Ap = (Ap + np.transpose(Ap)) / 2
+    model = SpectralClustering(n_clusters = num_clusters, affinity='precomputed', random_state=0)
+    labels = model.fit_predict(Ap)
+    return labels
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Usage: spec_clust.py [options] <simmat-rspecifier> <reco2utt-rspecifier> <labels-wspecifier>\n' +
+                                                 'Performs spectral clustering of xvectors according to pairwise similarity scores\n' +
+                                                 'Auto-selects binarization threshold')
+    parser.add_argument('simmat_rspec', type=str, help='Kaldi-style rspecifier of similarity scores matrices to read')
+    parser.add_argument('reco2utt_rspec', type=str, help='Kaldi-style rspecifier of recording-to-utterances correspondence')
+    parser.add_argument('labels_wspec', type=str, help='Kaldi-style wspecifier to save xvector cluster labels')
+    parser.add_argument('--max_neighbors', type=int, default=20, help='Maximum number of neighbors to threshold similarity matrix')
+    parser.add_argument('--reco2num_spk', type=str, default='', help='Kaldi-style rspecifier of recording-to-numofspeakers correspondence')
+    parser.add_argument('--num_clusters', type=int, default=None, help='Number of clusters to generate. Ignored if --reco2num_spk is given')
+    args = parser.parse_args()
+
+    assert args.max_neighbors > 1, 'Maximum number of neighbors should be at least 2, {} passed\n'.format(args.max_neighbors)
+
+    print('Spectral clustering of x-vectors according to precomputed similarity scores matrix')
+    print('Parameters:')
+    print('Similarity matrix rspecifier: {}'.format(args.simmat_rspec))
+    print('Reco2Utt rspecifier: {}'.format(args.reco2utt_rspec))
+    print('Labels wspecifier: {}'.format(args.labels_wspec))
+    print('Number of clusters to generate: {}'.format(args.num_clusters))
+    print('Maximum number of neighbors to threshold similarity matrix: {}\n'.format(args.max_neighbors))
+    print('Reco2NumSpk rspecifier: {}'.format(args.reco2num_spk))
+
+    print('Loading affinity matrices...', end='')
+    Matrices = LoadAffinityMatrix(args.simmat_rspec)
+    print('done')
+    print('Loading Reco2Utt correspondence...', end='')
+    IDs = LoadReco2Utt(args.reco2utt_rspec)
+    print('done')
+
+    if args.reco2num_spk != '':
+        NumSpk = LoadReco2NumSpk(args.reco2num_spk)
+
+    Labels = dict()
+    for id in IDs:
+        A = Matrices[id]
+
+        num_clusters = args.num_clusters if args.reco2num_spk == '' else NumSpk[id]
+        assert num_clusters is None or num_clusters > 0, 'Positive number of clusters expected for {}, {} found\n'.format(id, num_clusters)
+
+        print('Start clustering for recording {}...'.format(id))
+        Labels[id] = NME_SpectralClustering(A, num_clusters = num_clusters, pmax = args.max_neighbors)
+        print('Clustering done')
+    print('Saving labels...')
+    SaveLabels(IDs, Labels, args.labels_wspec)
+    print('done')
diff --git a/egs/chime6/s5b_track2/RESULTS b/egs/chime6/s5b_track2/RESULTS
new file mode 100644
index 00000000000..da0078e7122
--- /dev/null
+++ b/egs/chime6/s5b_track2/RESULTS
@@ -0,0 +1,37 @@
+# Results for CHiME-6 track 2 for dev and eval, using pretrained models
+# available at http://kaldi-asr.org/models/m12.
+
+# These results are reported only for array U06, which is the default
+# array selection method in the baseline system.
+
+# Speech Activity Detection (SAD)
+                  Missed speech   False alarm   Total error
+Dev  (old RTTM)   2.5             0.8           3.3
+Dev  (new RTTM)   1.9             0.7           2.6
+Eval (old RTTM)   4.1             1.8           5.9
+Eval (new RTTM)   4.3             1.5           5.8
+
+# Diarization (x-vectors + AHC)
+                  DER     JER
+Dev  (old RTTM)   61.56   69.75
+Dev  (new RTTM)   63.42   70.83
+Eval (old RTTM)   61.96   71.40
+Eval (new RTTM)   68.20   72.54
+
+# Diarization (x-vectors + Spectral Clustering), new RTTM
+       DER     JER
+Dev    59.03   61.94
+Eval   64.67   63.36
+
+# Diarization (3 iterations of TS-VAD), new RTTM
+           DER     JER
+Dev  it1   48.82   55.52
+Dev  it2   46.16   51.90
+Dev  it3   44.88   50.48
+Eval it1   46.08   51.23
+Eval it2   42.90   47.24
+Eval it3   42.08   46.48
+
+# ASR nnet3 tdnn+chain (GSS on TS-VAD segments)
+Dev:  %WER 66.33 [ 39055 / 58881, 2641 ins, 20923 del, 15491 sub ]
+Eval: %WER 60.03 [ 33098 / 55132, 1298 ins, 19428 del, 12372 sub ]
diff --git a/egs/chime6/s5b_track2/cmd.sh b/egs/chime6/s5b_track2/cmd.sh
new file mode 100644
index 00000000000..86514d94d4d
--- /dev/null
+++ b/egs/chime6/s5b_track2/cmd.sh
@@ -0,0 +1,14 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="retry.pl queue.pl --mem 2G"
+export decode_cmd="queue.pl --mem 4G"
diff --git a/egs/chime6/s5b_track2/conf/beamformit.cfg b/egs/chime6/s5b_track2/conf/beamformit.cfg
new file mode 100755
index 00000000000..70fdd858651
--- /dev/null
+++ b/egs/chime6/s5b_track2/conf/beamformit.cfg
@@ -0,0 +1,50 @@
+# BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/)
+
+# scrolling size to compute the delays
+scroll_size = 250
+
+# cross correlation computation window size
+window_size = 500
+
+# amount of maximum points for the xcorrelation taken into account
+nbest_amount = 4
+
+# flag whether to apply an automatic noise thresholding
+do_noise_threshold = 1
+
+# percentage of frames with lower xcorr taken as noisy
+noise_percent = 10
+
+######## acoustic modelling parameters
+
+# transition probabilities weight for multichannel decoding
+trans_weight_multi = 25
+trans_weight_nbest = 25
+
+###
+
+# flag whether to print the features after setting them, or not
+print_features = 1
+
+# flag whether to use the bad frames in the sum process
+do_avoid_bad_frames = 1
+
+# flag to use the best channel (SNR) as a reference
+# defined from command line
+do_compute_reference = 1
+
+# flag whether to use a uem file or not (process the whole file)
+do_use_uem_file = 0
+
+# flag whether to use an adaptive weights scheme or fixed weights
+do_adapt_weights = 1
+
+# flag whether to output the sph files or just run the system to create the auxiliary files
+do_write_sph_files = 1
+
+#### directories where to store/retrieve info ####
+# channels_file = ./cfg-files/channels
+
+# show needs to be passed as argument normally, here a default one is given just in case
+# show_id = Ttmp
+
diff --git a/egs/chime6/s5b_track2/conf/mfcc.conf b/egs/chime6/s5b_track2/conf/mfcc.conf
new file mode 100644
index 00000000000..32988403b00
--- /dev/null
+++ b/egs/chime6/s5b_track2/conf/mfcc.conf
@@ -0,0 +1,2 @@
+--use-energy=false
+--sample-frequency=16000
diff --git a/egs/chime6/s5b_track2/conf/mfcc_hires.conf b/egs/chime6/s5b_track2/conf/mfcc_hires.conf
new file mode 100644
index 00000000000..fd64b62eb16
--- /dev/null
+++ b/egs/chime6/s5b_track2/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated), which is why
+# we prefer this method.
+--use-energy=false   # use average of log energy, not energy.
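+# The options below keep all 40 cepstra from 40 mel bins (see the note above).
+# In Kaldi a non-positive --high-freq is an offset from the Nyquist frequency,
+# so with 16 kHz audio, -400 puts the upper filterbank edge at 7600 Hz.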
+--sample-frequency=16000
+--num-mel-bins=40
+--num-ceps=40
+--low-freq=40
+--high-freq=-400
diff --git a/egs/chime6/s5b_track2/conf/online_cmvn.conf b/egs/chime6/s5b_track2/conf/online_cmvn.conf
new file mode 100644
index 00000000000..7748a4a4dd3
--- /dev/null
+++ b/egs/chime6/s5b_track2/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
diff --git a/egs/chime6/s5b_track2/conf/sad.conf b/egs/chime6/s5b_track2/conf/sad.conf
new file mode 100644
index 00000000000..752bb1cf6c5
--- /dev/null
+++ b/egs/chime6/s5b_track2/conf/sad.conf
@@ -0,0 +1,2 @@
+affix=_1a
+nnet_type=stats
diff --git a/egs/chime6/s5b_track2/diarization b/egs/chime6/s5b_track2/diarization
new file mode 120000
index 00000000000..bad937c1444
--- /dev/null
+++ b/egs/chime6/s5b_track2/diarization
@@ -0,0 +1 @@
+../../callhome_diarization/v1/diarization
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/chain b/egs/chime6/s5b_track2/local/chain
new file mode 120000
index 00000000000..dd7910711d1
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/chain
@@ -0,0 +1 @@
+../../s5_track1/local/chain/
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/check_dset_error.py b/egs/chime6/s5b_track2/local/check_dset_error.py
new file mode 100755
index 00000000000..0ed7f59ae83
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/check_dset_error.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+# Copyright 2019 Ashish Arora
+# Apache 2.0.
+
+import argparse
+import os
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        description="""This script checks that, for each recording, the
+        per-speaker WER counts sum up to the totals reported for that
+        recording in all.txt""")
+    parser.add_argument("wer_dir_path", type=str,
+                        help="path of directory containing wer files")
+    parser.add_argument("output_dir_path", type=str,
+                        help="path of the directory containing per speaker output files")
+    args = parser.parse_args()
+    return args
+
+def get_results(filename):
+    with open(filename) as f:
+        first_line = f.readline()
+        parts = first_line.strip().split(',')
+        total_words = parts[0].split()[-1]
+        ins = parts[1].split()[0]
+        deletion = parts[2].split()[0]
+        sub = parts[3].split()[0]
+        return int(total_words), int(ins), int(deletion), int(sub)
+
+def main():
+    args = get_args()
+    recordingid_error_dict = {}
+    min_wer_per_recording = os.path.join(args.wer_dir_path, 'all.txt')
+    for line in open(min_wer_per_recording, 'r', encoding='utf8'):
+        toks = line.strip().split()
+        recordingid = toks[1]
+        total_words = toks[-5][:-1]
+        total_errors = toks[-4][:-1]
+        total_ins = toks[-3][:-1]
+        total_del = toks[-2][:-1]
+        total_sub = toks[-1]
+        recordingid_error_dict[recordingid] = (total_words, total_errors, total_ins, total_del, total_sub)
+
+    recording_spkorder_file = os.path.join(args.output_dir_path, 'recordinid_spkorder')
+    for line in open(recording_spkorder_file, 'r', encoding='utf8'):
+        parts = line.strip().split(':')
+        recordingid = parts[0]
+        spkorder = parts[1]
+        spkorder_list = spkorder.split('_')
+        num_speakers = len(spkorder_list)
+        total_errors = total_words = total_ins = total_del = total_sub = 0
+        for i in range(1, num_speakers+1):
+            filename = 'wer_' + recordingid + '_' + 'r' + str(i) + 'h' + str(spkorder_list[i-1])
+            wer_filename = os.path.join(args.wer_dir_path, filename)
+            words, ins, deletion, sub = get_results(wer_filename)
+            total_words += words
+            total_ins += ins
+            total_del += deletion
+            total_sub += sub
+            total_errors += ins + deletion + sub
+        assert int(total_words) == int(recordingid_error_dict[recordingid][0]), "Total words mismatch"
+        assert int(total_errors) == int(recordingid_error_dict[recordingid][1]), "Total errors mismatch"
+        assert int(total_ins) == int(recordingid_error_dict[recordingid][2]), "Total insertions mismatch"
+        assert int(total_del) == int(recordingid_error_dict[recordingid][3]), "Total deletions mismatch"
+        assert int(total_sub) == int(recordingid_error_dict[recordingid][4]), "Total substitutions mismatch"
+
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/chime6/s5b_track2/local/check_tools.sh b/egs/chime6/s5b_track2/local/check_tools.sh
new file mode 120000
index 00000000000..4e835e887f2
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/check_tools.sh
@@ -0,0 +1 @@
+../../s5_track1/local/check_tools.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/convert_rttm_to_utt2spk_and_segments.py b/egs/chime6/s5b_track2/local/convert_rttm_to_utt2spk_and_segments.py
new file mode 100755
index 00000000000..410dced190c
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/convert_rttm_to_utt2spk_and_segments.py
@@ -0,0 +1,98 @@
+#! /usr/bin/env python
+# Copyright 2019 Vimal Manohar
+# Apache 2.0.
+
+"""This script converts an RTTM with
+speaker info into kaldi utt2spk and segments"""
+
+import argparse
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        description="""This script converts an RTTM with
+        speaker info into kaldi utt2spk and segments""")
+    parser.add_argument("--use-reco-id-as-spkr", type=str,
+                        choices=["true", "false"], default="false",
+                        help="Use the recording ID based on RTTM and "
+                        "reco2file_and_channel as the speaker")
+    parser.add_argument("--append-reco-id-to-spkr", type=str,
+                        choices=["true", "false"], default="false",
+                        help="Append recording ID to the speaker ID")
+
+    parser.add_argument("rttm_file", type=str,
+                        help="""Input RTTM file.
+                        The format of the RTTM file is
+                        <type> <file-id> <channel-id> <begin-time> """
+                        """<duration> <NA> <NA> <speaker> <conf>""")
+    parser.add_argument("reco2file_and_channel", type=str,
+                        help="""Input reco2file_and_channel.
+                        The format is <recording-id> <file-id> <channel-id>.""")
+    parser.add_argument("utt2spk", type=str,
+                        help="Output utt2spk file")
+    parser.add_argument("segments", type=str,
+                        help="Output segments file")
+
+    args = parser.parse_args()
+
+    args.use_reco_id_as_spkr = bool(args.use_reco_id_as_spkr == "true")
+    args.append_reco_id_to_spkr = bool(args.append_reco_id_to_spkr == "true")
+
+    if args.use_reco_id_as_spkr:
+        if args.append_reco_id_to_spkr:
+            raise Exception("Appending recording ID to speaker does not make sense when using --use-reco-id-as-spkr=true")
+
+    return args

+def main():
+    args = get_args()
+
+    file_and_channel2reco = {}
+    utt2spk = {}
+    segments = {}
+    for line in open(args.reco2file_and_channel):
+        parts = line.strip().split()
+        file_and_channel2reco[(parts[1], parts[2])] = parts[0]
+
+    utt2spk_writer = open(args.utt2spk, 'w')
+    segments_writer = open(args.segments, 'w')
+    for line in open(args.rttm_file):
+        parts = line.strip().split()
+        if parts[0] != "SPEAKER":
+            continue
+
+        file_id = parts[1]
+        channel = parts[2]
+
+        try:
+            reco = file_and_channel2reco[(file_id, channel)]
+        except KeyError as e:
+            raise Exception("Could not find recording with "
+                            "(file_id, channel) "
+                            "= ({0},{1}) in {2}: {3}\n".format(
+                                file_id, channel,
+                                args.reco2file_and_channel, str(e)))
+
+        start_time = float(parts[3])
+        end_time = start_time + float(parts[4])
+
+        if args.use_reco_id_as_spkr:
+            spkr = reco
+        else:
+            if args.append_reco_id_to_spkr:
+                spkr = reco + "-" + parts[7]
+            else:
+                spkr = parts[7]
+
+        st = int(start_time * 100)
+        end = int(end_time * 100)
+        utt = "{0}-{1:06d}-{2:06d}".format(spkr, st, end)
+        utt2spk[utt] = spkr
+        segments[utt] = (reco, start_time, end_time)
+
+    for uttid_id in sorted(utt2spk):
+        utt2spk_writer.write("{0} {1}\n".format(uttid_id, utt2spk[uttid_id]))
+        segments_writer.write("{0} {1} {2:7.2f} {3:7.2f}\n".format(
+            uttid_id, segments[uttid_id][0], segments[uttid_id][1], segments[uttid_id][2]))
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/chime6/s5b_track2/local/copy_lat_dir_parallel.sh b/egs/chime6/s5b_track2/local/copy_lat_dir_parallel.sh
new file mode 120000
index 00000000000..a168a917d92
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/copy_lat_dir_parallel.sh
@@ -0,0 +1 @@
+../../s5_track1/local/copy_lat_dir_parallel.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/decode_ts-vad.sh b/egs/chime6/s5b_track2/local/decode_ts-vad.sh
new file mode 100755
index 00000000000..5ed51e8962f
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/decode_ts-vad.sh
@@ -0,0 +1,361 @@
+#!/usr/bin/env bash
+#
+# This script decodes raw utterances through the entire pipeline:
+# Feature extraction -> SAD -> Diarization -> TS-VAD diarization -> GSS enhancement -> ASR
+#
+# Copyright  2017  Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
+#            2019  Desh Raj, David Snyder, Ashish Arora, Zhaoheng Ni
+#            2020  Ivan Medennikov, Tatyana Prisyach, Maxim Korenevsky (STC-innovations Ltd)
+# Apache 2.0
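+
+# Stage map (a reading aid; see the blocks below): 0 CHiME-6 data generation,
+# 1 WPE dereverberation + BeamformIt, 2 MFCC extraction, 3 SAD, 4 x-vector
+# diarization, 5 TS-VAD iterations, 6 GSS enhancement, 7 ASR decoding, 8 scoring.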
+
+# Begin configuration section.
+nj=8
+stage=0
+sad_stage=0
+score_sad=true
+diarizer_stage=0
+score_stage=0
+ts_vad_num_iters=3
+
+enhancement=beamformit
+
+# option to use the new RTTM reference for sad and diarization
+use_new_rttm_reference=true
+if [ "$use_new_rttm_reference" == "true" ]; then
+  git clone https://github.com/nateanl/chime6_rttm
+fi
+
+# chime5 main directory path
+# please change the path accordingly
+chime5_corpus=/export/corpora4/CHiME5
+# chime6 data directories, which are generated from ${chime5_corpus},
+# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly
+chime6_corpus=${PWD}/CHiME6
+json_dir=${chime6_corpus}/transcriptions
+audio_dir=${chime6_corpus}/audio
+
+enhanced_dir=enhanced
+enhanced_dir=$(utils/make_absolute.sh $enhanced_dir) || exit 1
+
+# training data
+train_set=train_worn_simu_u400k
+test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb"
+
+# ts-vad
+ts_vad_dir=exp/ts-vad_b
+ivector_dir=exp/nnet3_b
+ups=18
+
+# spectral clustering
+daffix=
+use_sc=true
+
+# gss
+final_gss=true
+gss_nj=40
+bss_iterations=5
+context_samples=160000
+
+# number of microphones to perform GSS: outer_array_mics (CH1 and CH4 of each Kinect) or True (all microphones)
+multiarray=outer_array_mics
+
+# GSS activities: hard (standard binary activities) or soft (TS-VAD derived activities, not implemented yet)
+gss_type=hard
+
+. ./utils/parse_options.sh
+
+. ./cmd.sh
+. ./path.sh
+. ./conf/sad.conf
+
+$use_sc && daffix="_sc"
+pref_enhan=_${multiarray}_${context_samples}_${bss_iterations}it
+
+# This script also needs the phonetisaurus g2p, srilm, beamformit
+./local/check_tools.sh || exit 1
+
+###########################################################################
+# We first generate the synchronized audio files across arrays and
+# corresponding JSON files.  Note that this requires sox v14.4.2,
+# which is installed via miniconda in ./local/check_tools.sh
+###########################################################################
+
+if [ $stage -le 0 ]; then
+  local/generate_chime6_data.sh \
+    --cmd "$train_cmd" \
+    ${chime5_corpus} \
+    ${chime6_corpus}
+fi
+
+#######################################################################
+# Prepare the dev and eval data with dereverberation (WPE) and
+# beamforming.
+#######################################################################
+if [ $stage -le 1 ]; then
+  # Beamforming using reference arrays
+  # enhanced WAV directory
+  enhandir=enhan
+  dereverb_dir=${PWD}/wav/wpe/
+
+  for dset in dev eval; do
+    for mictype in u01 u02 u03 u04 u06; do
+      local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 20G" \
+        ${audio_dir}/${dset} \
+        ${dereverb_dir}/${dset} \
+        ${mictype}
+    done
+  done
+
+  for dset in dev eval; do
+    for mictype in u01 u02 u03 u04 u06; do
+      local/run_beamformit.sh --cmd "$train_cmd" \
+        ${dereverb_dir}/${dset} \
+        ${enhandir}/${dset}_${enhancement}_${mictype} \
+        ${mictype}
+    done
+  done
+
+  # Note that for the evaluation sets, we use the flag
+  # "--train false". This keeps the files segments, text,
+  # and utt2spk with .bak extensions, so that they can
+  # be used later for scoring if needed but are not used
+  # in the intermediate stages.
+  for dset in dev eval; do
+    local/prepare_data.sh --mictype ref --train false \
+      "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \
+      ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb
+  done
+
+fi
+
+if [ $stage -le 2 ]; then
+  # mfccdir should be some place with a largish disk where you
+  # want to store MFCC features.
+  mfccdir=mfcc
+  for x in ${test_sets}; do
+    steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \
+      --mfcc-config conf/mfcc_hires.conf \
+      data/$x exp/make_mfcc/$x $mfccdir
+  done
+fi
+
+#######################################################################
+# Perform SAD on the dev/eval data
+#######################################################################
+dir=exp/segmentation${affix}
+sad_work_dir=exp/sad${affix}_${nnet_type}/
+sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a
+
+if [ $stage -le 3 ]; then
+  for datadir in ${test_sets}; do
+    test_set=data/${datadir}
+    if [ ! -f ${test_set}/wav.scp ]; then
+      echo "$0: Not performing SAD on ${test_set}"
+      exit 0
+    fi
+    # Perform segmentation
+    local/segmentation/detect_speech_activity.sh --nj $nj --stage $sad_stage \
+      $test_set $sad_nnet_dir mfcc $sad_work_dir \
+      data/${datadir} || exit 1
+
+    test_dir=data/${datadir}_${nnet_type}_seg
+    mv data/${datadir}_seg ${test_dir}/
+    cp data/${datadir}/{segments.bak,utt2spk.bak} ${test_dir}/
+    # Generate RTTM file from segmentation performed by SAD. This can
+    # be used to evaluate the performance of the SAD as an intermediate
+    # step.
+    steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
+      ${test_dir}/utt2spk ${test_dir}/segments ${test_dir}/rttm
+
+    if [ $score_sad == "true" ]; then
+      echo "Scoring $datadir.."
+      # We first generate the reference RTTM from the backed up utt2spk and segments
+      # files.
+      ref_rttm=${test_dir}/ref_rttm
+      steps/segmentation/convert_utt2spk_and_segments_to_rttm.py ${test_dir}/utt2spk.bak \
+        ${test_dir}/segments.bak ${test_dir}/ref_rttm
+
+      # To score, we select just U06 segments from the hypothesis RTTM.
+      hyp_rttm=${test_dir}/rttm.U06
+      grep 'U06' ${test_dir}/rttm > ${test_dir}/rttm.U06
+      echo "Array U06 selected for scoring.."
+
+      if [ "$use_new_rttm_reference" == "true" ]; then
+        echo "Use the new RTTM reference."
+        mode="$(cut -d'_' -f1 <<<"$datadir")"
+        ref_rttm=./chime6_rttm/${mode}_rttm
+      fi
+
+      sed 's/_U0[1-6].ENH//g' $ref_rttm > $ref_rttm.scoring
+      sed 's/_U0[1-6].ENH//g' $hyp_rttm > $hyp_rttm.scoring
+      cat ./local/uem_file | grep 'U06' | sed 's/_U0[1-6]//g' > ./local/uem_file.tmp
+      md-eval.pl -1 -c 0.25 -u ./local/uem_file.tmp -r $ref_rttm.scoring -s $hyp_rttm.scoring |\
+        awk '/MISSED SPEECH/ || /FALARM SPEECH/'
+    fi
+  done
+fi
+
+#######################################################################
+# Perform diarization on the dev/eval data
+#######################################################################
+if [ $stage -le 4 ]; then
+  for datadir in ${test_sets}; do
+    if [ "$use_new_rttm_reference" == "true" ]; then
+      mode="$(cut -d'_' -f1 <<<"$datadir")"
+      ref_rttm=./chime6_rttm/${mode}_rttm
+    else
+      ref_rttm=data/${datadir}_${nnet_type}_seg/ref_rttm
+    fi
+    local/diarize${daffix}.sh --nj $nj --cmd "$train_cmd" --stage $diarizer_stage \
+      --ref-rttm $ref_rttm \
+      exp/xvector_nnet_1a \
+      data/${datadir}_${nnet_type}_seg \
+      exp/${datadir}_${nnet_type}_seg_diarization
+  done
+fi
+
+#######################################################################
+# Perform TS-VAD diarization on the dev/eval data
+#######################################################################
+if [ $stage -le 5 ]; then
+  for datadir in ${test_sets}; do
+    mode="$(cut -d'_' -f1 <<<"$datadir")"
+    if [ "$use_new_rttm_reference" == "true" ]; then
+      ref_rttm=./chime6_rttm/${mode}_rttm
+    else
+      ref_rttm=data/${datadir}_${nnet_type}_seg/ref_rttm
+    fi
+
+    [ ! 
-f data/${datadir}_diarized_hires/feats.scp ] && \ + local/prepare_diarized_data.sh --cmd "$train_cmd" \ + exp/${datadir}_${nnet_type}_seg_diarization \ + data/$datadir data/${datadir}_diarized + + # 1st iteration + it=1 + ivector_affix=baseline-init + local/ts-vad/diarize_TS-VAD_it1.sh --cmd "$train_cmd" \ + --ref-rttm $ref_rttm \ + --ivector-affix $ivector_affix \ + --thr 0.4 \ + $ts_vad_dir $ivector_dir ${datadir}_diarized \ + $ts_vad_dir/it${it}_${ivector_affix} || exit 1 + + initdir=$ts_vad_dir/it${it}_${ivector_affix}/${datadir}_U06_hires_split10000 + # 2nd and further iterations + while [ $it -lt $ts_vad_num_iters ]; do + ivector_affix=it${it}-init + it=$((it+1)) + mt=0.5 + t=0.5 + [ $it == "2" ] && mt=0 && t=0.5 + local/ts-vad/diarize_TS-VAD_it2.sh --cmd "$train_cmd" \ + --ups $ups \ + --ref-rttm $ref_rttm \ + --it $it \ + --ivector-affix $ivector_affix \ + --channels "CH1 CH2 CH3 CH4" \ + --audio_dir $audio_dir \ + --mt $mt \ + --t $t \ + --thr 0.4 \ + $ts_vad_dir $ivector_dir $initdir \ + $ts_vad_dir/it${it}_${ivector_affix} || exit 1 + initdir=$ts_vad_dir/it${it}_${ivector_affix}/${mode}_20ch-AVG_hires_split10000_${ups}ups + done + + if [ ! -f data/${datadir}_ts-vad-it${ts_vad_num_iters}-diarized_hires/feats.scp ]; then + cat $initdir/scoring/rttm | awk '{$2=$2"_U06"; print $0}' > $initdir/rttm + local/prepare_diarized_data.sh --cmd "$train_cmd" \ + $initdir data/$datadir data/${datadir}_ts-vad-it${ts_vad_num_iters}-diarized || exit 1 + fi + done +fi + +####################################################################### +# GSS on top of TS-VAD diarized segments +####################################################################### +if [ $stage -le 6 ]; then + if $final_gss; then + if [ ! -d pb_chime5/ ]; then + local/install_pb_chime5.sh + fi + echo "$0: enhance data..." + # Guided Source Separation (GSS) from Paderborn University + # http://spandh.dcs.shef.ac.uk/chime_workshop/papers/CHiME_2018_paper_boeddecker.pdf + # @Article{PB2018CHiME5, + # author = {Boeddeker, Christoph and Heitkaemper, Jens and Schmalenstroeer, Joerg and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold}, + # title = {{Front-End Processing for the CHiME-5 Dinner Party Scenario}}, + # year = {2018}, + # booktitle = {CHiME5 Workshop}, + # } + + miniconda_dir=$HOME/miniconda3/ + export PATH=$miniconda_dir/bin:$PATH + export CHIME6_DIR=$chime6_corpus + + for dset in ${test_sets}; do + datadir=data/${dset}_ts-vad-it${ts_vad_num_iters}-diarized + dset_type=`echo $dset | awk -F "_" '{print $1;}'` + [ ! -f ${datadir}_hires/chime6.json ] && python3 local/get_cache_chime6.py ${datadir}_hires/segments $dset_type $audio_dir/$dset_type ${datadir}_hires/chime6.json + [ ! -d pb_chime5/cache ] && mkdir pb_chime5/cache + cp -f ${datadir}_hires/chime6.json pb_chime5/cache/chime6.json + + enhanced_dir=data/gss_${gss_type}${pref_enhan}_ts-vad-it${ts_vad_num_iters}-diarized + if [ ! -f ${enhanced_dir}/.${dset_type}.done ]; then + local/run_gss.sh \ + --cmd "$train_cmd --max-jobs-run $gss_nj" --nj 512 \ + --bss_iterations $bss_iterations \ + --context_samples $context_samples \ + --multiarray $multiarray \ + ${dset_type} \ + ${enhanced_dir} \ + ${enhanced_dir} || exit 1 + touch ${enhanced_dir}/.${dset_type}.done + fi + + if [ ! 
-f data/${datadir}_gss_${gss_type}${pref_enhan}_hires/feats.scp ]; then
+        local/prepare_gss_data.sh ${enhanced_dir}/audio/${dset_type} ${datadir}_hires ${datadir}_gss_${gss_type}${pref_enhan}_hires
+      fi
+    done
+  fi
+fi
+
+#######################################################################
+# Decode diarized output using trained chain model
+#######################################################################
+if [ $stage -le 7 ]; then
+  for datadir in ${test_sets}; do
+    dset=data/${datadir}_ts-vad-it${ts_vad_num_iters}-diarized
+    if $final_gss; then
+      dset=${dset}_gss_${gss_type}${pref_enhan}
+    fi
+    echo "$0: performing decoding on the extracted features"
+    asr_model_dir=exp/chain_${train_set}_cleaned_rvb
+    local/nnet3/decode.sh --affix 2stage --acwt 1.0 --post-decode-acwt 10.0 \
+      --frames-per-chunk 150 --nj $nj --ivector-dir exp/nnet3_${train_set}_cleaned_rvb \
+      $dset data/lang $asr_model_dir/tree_sp/graph $asr_model_dir/tdnn1b_sp/ || exit 1
+  done
+fi
+
+#######################################################################
+# Score decoded dev/eval sets
+#######################################################################
+if [ $stage -le 8 ]; then
+  # final scoring to get the challenge result
+  # please specify both dev and eval set directories so that the search parameters
+  # (insertion penalty and language model weight) will be tuned using the dev set
+  dev_dir=dev_beamformit_dereverb_ts-vad-it${ts_vad_num_iters}-diarized
+  eval_dir=eval_beamformit_dereverb_ts-vad-it${ts_vad_num_iters}-diarized
+  if $final_gss; then
+    dev_dir=${dev_dir}_gss_${gss_type}${pref_enhan}
+    eval_dir=${eval_dir}_gss_${gss_type}${pref_enhan}
+  fi
+  local/score_for_submit.sh --stage $score_stage \
+    --dev_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_${dev_dir}_2stage \
+    --dev_datadir ${dev_dir}_hires \
+    --eval_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_${eval_dir}_2stage \
+    --eval_datadir ${eval_dir}_hires
+fi
+
+exit 0;
diff --git a/egs/chime6/s5b_track2/local/diarize.sh b/egs/chime6/s5b_track2/local/diarize.sh
new file mode 100755
index 00000000000..d555e92c0e8
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/diarize.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# Copyright  2019  David Snyder
+#            2020  Desh Raj
+
+# Apache 2.0.
+#
+# This script takes an input directory that has a segments file (and
+# a feats.scp file), and performs diarization on it. The output directory
+# contains an RTTM file which can be used to resegment the input data.
+
+stage=0
+nj=10
+cmd="run.pl"
+ref_rttm=
+
+echo "$0 $@"  # Print the command line for logging
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+if [ $# != 3 ]; then
+  echo "Usage: $0 <model-dir> <in-data-dir> <out-dir>"
+  echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization"
+  echo "Options: "
+  echo "  --nj <n|10>                                      # number of parallel jobs."
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --ref_rttm ./local/dev_rttm                      # the location of the reference RTTM file"
+  exit 1;
+fi
+
+model_dir=$1
+data_in=$2
+out_dir=$3
+
+name=`basename $data_in`
+
+for f in $data_in/feats.scp $data_in/segments $model_dir/plda \
+  $model_dir/final.raw $model_dir/extract.config; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+if [ $stage -le 0 ]; then
+  echo "$0: keeping only data corresponding to array U06"
+  echo "$0: we can skip this stage, to perform diarization on all arrays"
+  # To perform diarization and scoring on all arrays, please skip this step
+  # and pass all_array = true in local/multispeaker_score.sh
+  cp -r data/$name data/${name}.bak
+  mv data/$name/wav.scp data/$name/wav.scp.bak
+  grep 'U06' data/$name/wav.scp.bak > data/$name/wav.scp
+  utils/fix_data_dir.sh data/$name
+  nj=2 # since we have reduced number of "speakers" now
+fi
+
+if [ $stage -le 1 ]; then
+  echo "$0: computing features for x-vector extractor"
+  utils/fix_data_dir.sh data/${name}
+  rm -rf data/${name}_cmn
+  local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \
+    data/$name data/${name}_cmn exp/${name}_cmn
+  cp data/$name/segments exp/${name}_cmn/
+  utils/fix_data_dir.sh data/${name}_cmn
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: extracting x-vectors for all segments"
+  diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \
+    --nj $nj --window 1.5 --period 0.75 --apply-cmn false \
+    --min-segment 0.5 $model_dir \
+    data/${name}_cmn $out_dir/xvectors_${name}
+fi
+
+# Perform PLDA scoring
+if [ $stage -le 3 ]; then
+  # Perform PLDA scoring on all pairs of segments for each recording.
+  echo "$0: performing PLDA scoring between all pairs of x-vectors"
+  diarization/nnet3/xvector/score_plda.sh --cmd "$cmd" \
+    --target-energy 0.5 \
+    --nj $nj $model_dir/ $out_dir/xvectors_${name} \
+    $out_dir/xvectors_${name}/plda_scores
+fi
+
+if [ $stage -le 4 ]; then
+  echo "$0: performing clustering using PLDA scores (we assume 4 speakers per recording)"
+  awk '{print $1, "4"}' data/$name/wav.scp > data/$name/reco2num_spk
+  diarization/cluster.sh --cmd "$cmd" --nj $nj \
+    --reco2num-spk data/$name/reco2num_spk \
+    --rttm-channel 1 \
+    $out_dir/xvectors_${name}/plda_scores $out_dir
+  echo "$0: wrote RTTM to output directory ${out_dir}"
+fi
+
+hyp_rttm=${out_dir}/rttm
+
+# For scoring the diarization system, we use the same tool that was
+# used in the DIHARD II challenge. This is available at:
+# https://github.com/nryant/dscore
+# Note that the scoring takes a single reference RTTM and a single
+# hypothesis RTTM.
+if [ $stage -le 5 ]; then
+  # If a reference RTTM file is not provided, we create one using the backed up
+  # segments and utt2spk files in the original data directory.
+  if [ -z "$ref_rttm" ]; then
+    steps/segmentation/convert_utt2spk_and_segments_to_rttm.py data/$name/utt2spk.bak \
+      data/$name/segments.bak data/$name/rttm
+    ref_rttm=data/$name/rttm
+  fi
+  echo "Diarization results for "${name}
+  if ! [ -d dscore ]; then
+    git clone https://github.com/nryant/dscore.git || exit 1;
+    cd dscore
+    python -m pip install --user -r requirements.txt
+    cd ..
+  fi
+  sed 's/_U0[1-6]\.ENH//g' $ref_rttm > $ref_rttm.scoring
+  sed 's/_U0[1-6]\.ENH//g' $hyp_rttm > $hyp_rttm.scoring
+  ref_rttm_path=$(readlink -f ${ref_rttm}.scoring)
+  hyp_rttm_path=$(readlink -f ${hyp_rttm}.scoring)
+  cat ./local/uem_file | grep 'U06' | sed 's/_U0[1-6]//g' > ./local/uem_file.scoring
+  cd dscore && python score.py -u ../local/uem_file.scoring -r $ref_rttm_path \
+    -s $hyp_rttm_path && cd .. || exit 1;
+fi
diff --git a/egs/chime6/s5b_track2/local/diarize_sc.sh b/egs/chime6/s5b_track2/local/diarize_sc.sh
new file mode 100755
index 00000000000..247bead2d35
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/diarize_sc.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+# Copyright  2019  David Snyder
+#            2020  Maxim Korenevsky, Ivan Medennikov (STC-innovations Ltd)
+# Apache 2.0.
+#
+# This script takes an input directory that has a segments file (and
+# a feats.scp file), and performs spectral clustering based diarization on it.
+# The output directory contains an RTTM file which can be used to resegment the input data.
+
+stage=0
+nj=10
+cmd="run.pl"
+ref_rttm=
+
+echo "$0 $@"  # Print the command line for logging
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+if [ $# != 3 ]; then
+  echo "Usage: $0 <model-dir> <in-data-dir> <out-dir>"
+  echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization"
+  echo "Options: "
+  echo "  --nj <n|10>                                      # number of parallel jobs."
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --ref-rttm <rttm-file>                           # if present, used to score output RTTM."
+  exit 1;
+fi
+
+model_dir=$1
+data_in=$2
+out_dir=$3
+
+name=`basename $data_in`
+
+for f in $data_in/feats.scp $data_in/segments $model_dir/plda \
+  $model_dir/final.raw $model_dir/extract.config; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+if [ $stage -le 0 ]; then
+  echo "$0: keeping only data corresponding to array U06"
+  echo "$0: we can skip this stage, to perform diarization on all arrays"
+  # To perform diarization and scoring on all arrays, please skip this step
+  # and pass all_array = true in local/multispeaker_score.sh
+  cp -r data/$name data/${name}.bak
+  mv data/$name/wav.scp data/$name/wav.scp.bak
+  grep 'U06' data/$name/wav.scp.bak > data/$name/wav.scp
+  utils/fix_data_dir.sh data/$name
+fi
+nj=2 # since we have reduced number of "speakers" now
+
+if [ $stage -le 1 ]; then
+  echo "$0: computing features for x-vector extractor"
+  utils/fix_data_dir.sh data/${name}
+  rm -rf data/${name}_cmn
+  local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \
+    data/$name data/${name}_cmn exp/${name}_cmn
+  cp data/$name/segments exp/${name}_cmn/
+  utils/fix_data_dir.sh data/${name}_cmn
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: extracting x-vectors for all segments"
+  diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \
+    --nj $nj --window 1.5 --period 0.75 --apply-cmn false \
+    --min-segment 0.5 $model_dir \
+    data/${name}_cmn $out_dir/xvectors_${name}
+fi
+
+# Perform cosine similarity scoring
+if [ $stage -le 3 ]; then
+  # Perform cosine similarity scoring on all pairs of segments for each recording.
+  echo "$0: performing cosine similarity scoring between all pairs of x-vectors"
+  diarization/score_cossim.sh --cmd "$cmd" \
+    --nj $nj $out_dir/xvectors_${name} \
+    $out_dir/xvectors_${name}/cossim_scores
+fi
+
+
+if [ $stage -le 4 ]; then
+  echo "$0: performing spectral clustering using cosine similarity scores (we assume 4 speakers per recording)"
+  awk '{print $1, "4"}' data/$name/wav.scp > data/$name/reco2num_spk
+  diarization/scluster.sh --cmd "$cmd" --nj $nj \
+    --reco2num-spk data/$name/reco2num_spk \
+    --rttm-channel 1 \
+    $out_dir/xvectors_${name}/cossim_scores $out_dir
+  echo "$0: wrote RTTM to output directory ${out_dir}"
+fi
+
+hyp_rttm=${out_dir}/rttm
+
+# For scoring the diarization system, we use the same tool that was
+# used in the DIHARD II challenge. This is available at:
+# https://github.com/nryant/dscore
+# Note that the scoring takes a single reference RTTM and a single
+# hypothesis RTTM.
+if [ $stage -le 5 ]; then
+  # If a reference RTTM file is not provided, we create one using the backed up
+  # segments and utt2spk files in the original data directory.
+  if [ -z "$ref_rttm" ]; then
+    steps/segmentation/convert_utt2spk_and_segments_to_rttm.py data/$name/utt2spk.bak \
+      data/$name/segments.bak data/$name/rttm
+    ref_rttm=data/$name/rttm
+  fi
+  echo "Diarization results for "${name}
+  if ! [ -d dscore ]; then
+    git clone https://github.com/nryant/dscore.git || exit 1;
+    cd dscore
+    python -m pip install --user -r requirements.txt
+    cd ..
+  fi
+  sed 's/_U0[1-6]\.ENH//g' $ref_rttm > $ref_rttm.scoring
+  sed 's/_U0[1-6]\.ENH//g' $hyp_rttm > $hyp_rttm.scoring
+  ref_rttm_path=$(readlink -f ${ref_rttm}.scoring)
+  hyp_rttm_path=$(readlink -f ${hyp_rttm}.scoring)
+  cat ./local/uem_file | grep 'U06' | sed 's/_U0[1-6]//g' > ./local/uem_file.scoring
+  cd dscore && python score.py -u ../local/uem_file.scoring -r $ref_rttm_path \
+    -s $hyp_rttm_path 2>&1 | tee -a ../${out_dir}/DER && cd .. || exit 1;
+fi
diff --git a/egs/chime6/s5b_track2/local/distant_audio_list b/egs/chime6/s5b_track2/local/distant_audio_list
new file mode 120000
index 00000000000..0455876cf4d
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/distant_audio_list
@@ -0,0 +1 @@
+../../s5_track1/local/distant_audio_list
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/extract_noises.py b/egs/chime6/s5b_track2/local/extract_noises.py
new file mode 120000
index 00000000000..04a6389916d
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/extract_noises.py
@@ -0,0 +1 @@
+../../s5_track1/local/extract_noises.py
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/extract_vad_weights.sh b/egs/chime6/s5b_track2/local/extract_vad_weights.sh
new file mode 120000
index 00000000000..0db29cded5d
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/extract_vad_weights.sh
@@ -0,0 +1 @@
+../../s5_track1/local/extract_vad_weights.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/gen_aligned_hyp.py b/egs/chime6/s5b_track2/local/gen_aligned_hyp.py
new file mode 100755
index 00000000000..acaa3a13ad5
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/gen_aligned_hyp.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+# Copyright 2019 Yusuke Fujita
+# Apache 2.0.
+
+"""This script generates hypothesis utterances aligned with reference segments.
+   Usage: gen_aligned_hyp.py alignment.txt wc.txt > hyp.txt
+   alignment.txt is a session-level word alignment generated by the align-text command.
+   wc.txt is a sequence of utt-id:reference_word_count entries generated by 'local/get_ref_perspeaker_persession_file.py'.
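+   Illustrative (hypothetical) line formats, matching how they are parsed below:
+     alignment.txt:  S02 the the ; cat hat ; sat sat
+     wc.txt:         S02 S02_P05-001:2 S02_P05-002:1
+   which yields hyp.txt:
+     S02_P05-001 the hat
+     S02_P05-002 sat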
+""" + +import sys, io +import string +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +def load_align_text(f): + alignments = {} + for line in f: + recoid, res = line.split(None, 1) + alignments[recoid] = [] + toks = res.split(';') + for tok in toks: + ref, hyp = tok.split() + alignments[recoid].append((ref, hyp)) + return alignments + +alignments = load_align_text(open(sys.argv[1],'r', encoding='utf8')) + +for line in open(sys.argv[2],'r', encoding='utf8'): + recoid, res = line.split(None, 1) + ali = iter(alignments[recoid]) + toks = res.split() + for tok in toks: + uttid, count = tok.split(':') + count = int(count) + text = '' + for i in range(count): + while True: + ref, hyp = ali.__next__() + if hyp != '': + text += ' ' + hyp + if ref != '': + break + output.write(uttid + ' ' + text.strip() + '\n') diff --git a/egs/chime6/s5b_track2/local/generate_chime6_data.sh b/egs/chime6/s5b_track2/local/generate_chime6_data.sh new file mode 120000 index 00000000000..62882cd6279 --- /dev/null +++ b/egs/chime6/s5b_track2/local/generate_chime6_data.sh @@ -0,0 +1 @@ +../../s5_track1/local/generate_chime6_data.sh \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/get_best_error.py b/egs/chime6/s5b_track2/local/get_best_error.py new file mode 100755 index 00000000000..b9d8b0d43e7 --- /dev/null +++ b/egs/chime6/s5b_track2/local/get_best_error.py @@ -0,0 +1,84 @@ +#! /usr/bin/env python3 +# Copyright 2019 Ashish Arora +# Apache 2.0. +"""This script finds best matching of reference and hypothesis speakers. + For the best matching speakers,it provides the WER for the reference session + (eg:S02) and hypothesis recording (eg: S02_U02)""" + +import itertools +import numpy as np +import argparse +from munkres import Munkres + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script finds best matching of reference and hypothesis speakers. 
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        description="""This script finds the best matching of reference and hypothesis speakers.
+        For the best matching it provides the WER""")
+    parser.add_argument("WER_dir", type=str,
+                        help="path of WER files")
+    parser.add_argument("recording_id", type=str,
+                        help="recording_id name")
+    parser.add_argument("num_speakers", type=str,
+                        help="number of speakers in ref")
+    args = parser.parse_args()
+    return args
+
+
+def get_results(filename):
+    with open(filename) as f:
+        first_line = f.readline()
+        parts = first_line.strip().split(',')
+        total_words = parts[0].split()[-1]
+        ins = parts[1].split()[0]
+        deletions = parts[2].split()[0]
+        sub = parts[3].split()[0]
+        return total_words, ins, deletions, sub
+
+
+def get_min_wer(recording_id, num_speakers, WER_dir):
+    best_wer_file = WER_dir + '/' + 'best_wer' + '_' + recording_id
+    best_wer_writer = open(best_wer_file, 'w')
+    m = Munkres()
+    total_error_mat = [0] * num_speakers
+    all_errors_mat = [0] * num_speakers
+    for i in range(num_speakers):
+        total_error_mat[i] = [0] * num_speakers
+        all_errors_mat[i] = [0] * num_speakers
+    for i in range(1, num_speakers+1):
+        for j in range(1, num_speakers+1):
+            filename = '/wer_' + recording_id + '_' + 'r' + str(i) + 'h' + str(j)
+            filename = WER_dir + filename
+            total_words, ins, deletions, sub = get_results(filename)
+            ins = int(ins)
+            deletions = int(deletions)
+            sub = int(sub)
+            total_error = ins + deletions + sub
+            total_error_mat[i-1][j-1] = total_error
+            all_errors_mat[i-1][j-1] = (total_words, total_error, ins, deletions, sub)
+
+    # Hungarian algorithm: minimum total error over all speaker assignments
+    indexes = m.compute(total_error_mat)
+    total_errors = total_words = total_ins = total_del = total_sub = 0
+    spk_order = '('
+    for row, column in indexes:
+        words, errs, ins, deletions, sub = all_errors_mat[row][column]
+        total_errors += int(errs)
+        total_words += int(words)
+        total_ins += int(ins)
+        total_del += int(deletions)
+        total_sub += int(sub)
+        spk_order = spk_order + str(column+1) + ', '
+    spk_order = spk_order + ')'
+    text = "Best error: (#T #E #I #D #S) " + str(total_words) + ', ' + str(total_errors) + ', ' + str(total_ins) + ', ' + str(total_del) + ', ' + str(total_sub)
+    best_wer_writer.write(" recording_id: " + recording_id + ' ')
+    best_wer_writer.write(' best hypothesis speaker order: ' + spk_order + ' ')
+    best_wer_writer.write(text + '\n')
+    best_wer_writer.close()
+
+
+def main():
+    args = get_args()
+    get_min_wer(args.recording_id, int(args.num_speakers), args.WER_dir)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/chime6/s5b_track2/local/get_cache_chime6.py b/egs/chime6/s5b_track2/local/get_cache_chime6.py
new file mode 100755
index 00000000000..5b822fb6fb5
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/get_cache_chime6.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Prisyach Tatyana (STC-innovations Ltd)
+# Apache 2.0.
+
+import json
+import argparse
+import sys
+import re
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        description="""This script creates chime6.json from ts-vad segments for dev and eval
+        in the format required by pb_chime5, e.g.
+        {} segments \\
+        dev \\
+        CHiME6/audio/dev \\
+        pb_chime5/cache/chime6.json""".format(sys.argv[0]))
+
+    parser.add_argument("segments", help="""ts-vad segments""")
+    parser.add_argument("dset_type", help="""dataset name (dev or eval)""")
+    parser.add_argument("dir_chime6", help="""chime6 data directory to dev or eval""")
+    parser.add_argument("json", help="""path to chime6.json""")
+    args = parser.parse_args()
+    return args
+
+def create_main_fields(dset_type):
+    if dset_type == "dev":
+        to_json = {"alias": {"dev": ["S02", "S09"]}, "datasets": {"S02": {}, "S09": {}}}
+    elif dset_type == "eval":
+        to_json = {"alias": {"eval": ["S01", "S21"]}, "datasets": {"S01": {}, "S21": {}}}
+    else:
+        sys.exit("dset_type must be 'dev' or 'eval', got '{}'".format(dset_type))
+    return to_json
+
+def create_utt_field(dset_type, utt_name, ses, spk, time_start, time_end, fd, dir_chime6):
+    if dset_type == "dev":
+        if ses == "S02":
+            dic_spk = ["P05", "P06", "P07", "P08"]
+            kinects = ["U01", "U02", "U03", "U04", "U05", "U06"]
+        elif ses == "S09":
+            dic_spk = ["P25", "P26", "P27", "P28"]
+            kinects = ["U01", "U02", "U03", "U04", "U06"]
+    elif dset_type == "eval":
+        if ses == "S01":
+            dic_spk = ["P01", "P02", "P03", "P04"]
+            kinects = ["U01", "U02", "U04", "U05", "U06"]
+        elif ses == "S21":
+            dic_spk = ["P45", "P46", "P47", "P48"]
+            kinects = ["U01", "U02", "U03", "U04", "U05", "U06"]
+
+    start = int(time_start * fd)
+    end = int(time_end * fd)
+    to_json = {"audio_path": {"observation": " "}, "end": end, "gender": " ", "location": " ",
+               "notes": [], "num_samples": " ", "reference_array": " ", "session_id": " ",
+               "speaker_id": " ", "start": start, "transcription": " "}
+    to_json["num_samples"] = end - start
+    to_json["session_id"] = ses
+    to_json["speaker_id"] = dic_spk[spk-1]
+
+    channels = 4
+    to_json["audio_path"]["observation"] = {}
+    for kinect in kinects:
+        kinect_ch = []
+        for ch in range(channels):
+            kinect_ch.append(dir_chime6 + "/" + ses + "_" + kinect + "." + "CH" + str(ch+1) + ".wav")
+        to_json["audio_path"]["observation"][kinect] = kinect_ch
+    return to_json
+
+def main():
+    args = get_args()
+
+    fd = 16000  # sampling frequency of CHiME-6 audio
+
+    print("dset_type=", args.dset_type)
+    json_chime6 = create_main_fields(args.dset_type)
+
+    utt_list = open(args.segments).readlines()
+
+    f_json = open(args.json, 'w')
+
+    for utt in utt_list:
+        utt_name, wav, time_start, time_end = utt.split()
+        ses, kinect, spk, start, end = re.split('_|-', utt_name)
+        utt_field = create_utt_field(args.dset_type, utt_name, ses, int(spk), float(time_start), float(time_end), fd, args.dir_chime6)
+        json_chime6["datasets"][ses][utt_name] = utt_field
+
+    f_json.write(json.dumps(json_chime6, indent=4))
+    f_json.close()
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/chime6/s5b_track2/local/get_cache_chime6.sh b/egs/chime6/s5b_track2/local/get_cache_chime6.sh
new file mode 100755
index 00000000000..f4bfd6de3f0
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/get_cache_chime6.sh
@@ -0,0 +1,53 @@
+#!/bin/bash -u
+
+# Copyright 2020 Prisyach Tatyana (STC-innovations Ltd)
+# Apache 2.0.
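+
+# This awk-based script is an alternative to get_cache_chime6.py above and
+# writes the same pb_chime5 cache JSON. A usage sketch (the paths below are
+# illustrative, not fixed by the recipe):
+#
+#   local/get_cache_chime6.sh data/dev_diarized/segments dev \
+#     CHiME6/audio/dev pb_chime5/cache/chime6.json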
+ +segments=$1 +dset=$2 +dir_chime6=$3 +json=$4 + +if [ $dset == "dev" ]; then + awk -F "_|-| " -v dir_chime6=$dir_chime6 -v S02="false" -v S09="false" ' + BEGIN { printf"%s","{\n \"alias\": {\n \"dev\": [\n \"S02\",\n \"S09\"\n ]\n },\n \"datasets\": {\n"; } + {if (($1 == "S02" && S02 == "true") || ($1 == "S09" && S09 == "true")) { printf"%s",",\n"; } + else if ($1 == "S09" && S09 == "false") { printf"%s","\n },\n"; } + if ($1 == "S02") { + if (S02 == "false") { printf"%s"," \"S02\": {\n"; S02="true"; } + if ($3 == "1") { spk="P05"; } + else if ($3 == "2") { spk="P06"; } + else if ($3 == "3") { spk="P07"; } + else { spk="P08"; } + printf"%s"," \"" $1"_"$2"-"$3"-"$4"-"$5 "\": {\n \"audio_path\": {\n \"observation\": {\n \"U01\": [\n \""dir_chime6"/S02_U01.CH1.wav\",\n \""dir_chime6"/S02_U01.CH2.wav\",\n \""dir_chime6"/S02_U01.CH3.wav\",\n \""dir_chime6"/S02_U01.CH4.wav\"\n ],\n \"U02\": [\n \""dir_chime6"/S02_U02.CH1.wav\",\n \""dir_chime6"/S02_U02.CH2.wav\",\n \""dir_chime6"/S02_U02.CH3.wav\",\n \""dir_chime6"/S02_U02.CH4.wav\"\n ],\n \"U03\": [\n \""dir_chime6"/S02_U03.CH1.wav\",\n \""dir_chime6"/S02_U03.CH2.wav\",\n \""dir_chime6"/S02_U03.CH3.wav\",\n \""dir_chime6"/S02_U03.CH4.wav\"\n ],\n \"U04\": [\n \""dir_chime6"/S02_U04.CH1.wav\",\n \""dir_chime6"/S02_U04.CH2.wav\",\n \""dir_chime6"/S02_U04.CH3.wav\",\n \""dir_chime6"/S02_U04.CH4.wav\"\n ],\n \"U05\": [\n \""dir_chime6"/S02_U05.CH1.wav\",\n \""dir_chime6"/S02_U05.CH2.wav\",\n \""dir_chime6"/S02_U05.CH3.wav\",\n \""dir_chime6"/S02_U05.CH4.wav\"\n ],\n \"U06\": [\n \""dir_chime6"/S02_U06.CH1.wav\",\n \""dir_chime6"/S02_U06.CH2.wav\",\n \""dir_chime6"/S02_U06.CH3.wav\",\n \""dir_chime6"/S02_U06.CH4.wav\"\n ]\n },\n \"worn\": {\n \"P05\":\n \""dir_chime6"/S02_P05.wav\",\n \"P06\":\n \""dir_chime6"/S02_P06.wav\",\n \"P07\":\n \""dir_chime6"/S02_P07.wav\",\n \"P08\":\n \""dir_chime6"/S02_P08.wav\"\n }\n },\n \"end\": "int($9*16000)",\n \"gender\": \""unk"\",\n \"location\": \""unk"\",\n \"notes\": [],\n \"num_samples\": "int($9*16000-$8*16000)",\n \"reference_array\": \""unk"\",\n \"session_id\": \"S02\",\n \"speaker_id\": \""spk"\",\n \"start\": "int($8*16000)",\n \"transcription\": \"\"\n }"; + } else if ($1 == "S09") { + if (S09 == "false") { printf"%s"," \"S09\": {\n"; S09="true"; } + if ($3 == "1") { spk="P25"; } + else if ($3 == "2") { spk="P26"; } + else if ($3 == "3") { spk="P27"; } + else { spk="P28"; } + printf"%s"," \"" $1"_"$2"-"$3"-"$4"-"$5 "\": {\n \"audio_path\": {\n \"observation\": {\n \"U01\": [\n \""dir_chime6"/S09_U01.CH1.wav\",\n \""dir_chime6"/S09_U01.CH2.wav\",\n \""dir_chime6"/S09_U01.CH3.wav\",\n \""dir_chime6"/S09_U01.CH4.wav\"\n ],\n \"U02\": [\n \""dir_chime6"/S09_U02.CH1.wav\",\n \""dir_chime6"/S09_U02.CH2.wav\",\n \""dir_chime6"/S09_U02.CH3.wav\",\n \""dir_chime6"/S09_U02.CH4.wav\"\n ],\n \"U03\": [\n \""dir_chime6"/S09_U03.CH1.wav\",\n \""dir_chime6"/S09_U03.CH2.wav\",\n \""dir_chime6"/S09_U03.CH3.wav\",\n \""dir_chime6"/S09_U03.CH4.wav\"\n ],\n \"U04\": [\n \""dir_chime6"/S09_U04.CH1.wav\",\n \""dir_chime6"/S09_U04.CH2.wav\",\n \""dir_chime6"/S09_U04.CH3.wav\",\n \""dir_chime6"/S09_U04.CH4.wav\"\n ],\n \"U06\": [\n \""dir_chime6"/S09_U06.CH1.wav\",\n \""dir_chime6"/S09_U06.CH2.wav\",\n \""dir_chime6"/S09_U06.CH3.wav\",\n \""dir_chime6"/S09_U06.CH4.wav\"\n ]\n },\n \"worn\": {\n \"P25\":\n \""dir_chime6"/S09_P25.wav\",\n \"P26\":\n \""dir_chime6"/S09_P26.wav\",\n \"P27\":\n \""dir_chime6"/S09_P27.wav\",\n \"P28\":\n \""dir_chime6"/S09_P28.wav\"\n }\n },\n \"end\": "int($9*16000)",\n \"gender\": 
\""unk"\",\n \"location\": \""unk"\",\n \"notes\": [],\n \"num_samples\": "int($9*16000-$8*16000)",\n \"reference_array\": \""unk"\",\n \"session_id\": \"S09\",\n \"speaker_id\": \""spk"\",\n \"start\": "int($8*16000)",\n \"transcription\": \"\"\n }";} + } + END { printf"%s","\n }\n }\n}\n"; }' $segments > $json +elif [ $dset == "eval" ]; then + awk -F "_|-| " -v dir_chime6=$dir_chime6 -v S01="false" -v S21="false" ' + BEGIN { printf"%s","{\n \"alias\": {\n \"eval\": [\n \"S01\",\n \"S21\"\n ]\n },\n \"datasets\": {\n"; } + {if (($1 == "S01" && S01 == "true") || ($1 == "S21" && S21 == "true")) { printf"%s",",\n"; } + else if ($1 == "S21" && S21 == "false") { printf"%s","\n },\n"; } + if ($1 == "S01"){ + if (S01 == "false") { printf"%s"," \"S01\": {\n"; S01="true"; } + if ($3 == "1") { spk="P01"; } + else if ($3 == "2") { spk="P02"; } + else if ($3 == "3") { spk="P03"; } + else { spk="P04"; } + printf"%s"," \"" $1"_"$2"-"$3"-"$4"-"$5 "\": {\n \"audio_path\": {\n \"observation\": {\n \"U01\": [\n \""dir_chime6"/S01_U01.CH1.wav\",\n \""dir_chime6"/S01_U01.CH2.wav\",\n \""dir_chime6"/S01_U01.CH3.wav\",\n \""dir_chime6"/S01_U01.CH4.wav\"\n ],\n \"U02\": [\n \""dir_chime6"/S01_U02.CH1.wav\",\n \""dir_chime6"/S01_U02.CH2.wav\",\n \""dir_chime6"/S01_U02.CH3.wav\",\n \""dir_chime6"/S01_U02.CH4.wav\"\n ],\n \"U04\": [\n \""dir_chime6"/S01_U04.CH1.wav\",\n \""dir_chime6"/S01_U04.CH2.wav\",\n \""dir_chime6"/S01_U04.CH3.wav\",\n \""dir_chime6"/S01_U04.CH4.wav\"\n ],\n \"U05\": [\n \""dir_chime6"/S01_U05.CH1.wav\",\n \""dir_chime6"/S01_U05.CH2.wav\",\n \""dir_chime6"/S01_U05.CH3.wav\",\n \""dir_chime6"/S01_U05.CH4.wav\"\n ],\n \"U06\": [\n \""dir_chime6"/S01_U06.CH1.wav\",\n \""dir_chime6"/S01_U06.CH2.wav\",\n \""dir_chime6"/S01_U06.CH3.wav\",\n \""dir_chime6"/S01_U06.CH4.wav\"\n ]\n }\n },\n \"end\": "int($9*16000)",\n \"gender\": \""unk"\",\n \"location\": \""unk"\",\n \"notes\": [],\n \"num_samples\": "int($9*16000-$8*16000)",\n \"reference_array\": \""unk"\",\n \"session_id\": \"S01\",\n \"speaker_id\": \""spk"\",\n \"start\": "int($8*16000)",\n \"transcription\": \"\"\n }"; + } else if ($1 == "S21") { + if (S21 == "false") { printf"%s"," \"S21\": {\n"; S21="true"; } + if ($3 == "1") { spk="P45"; } + else if ($3 == "2") { spk="P46"; } + else if ($3 == "3") { spk="P47"; } + else { spk="P48"; } + printf"%s"," \"" $1"_"$2"-"$3"-"$4"-"$5 "\": {\n \"audio_path\": {\n \"observation\": {\n \"U01\": [\n \""dir_chime6"/S21_U01.CH1.wav\",\n \""dir_chime6"/S21_U01.CH2.wav\",\n \""dir_chime6"/S21_U01.CH3.wav\",\n \""dir_chime6"/S21_U01.CH4.wav\"\n ],\n \"U02\": [\n \""dir_chime6"/S21_U02.CH1.wav\",\n \""dir_chime6"/S21_U02.CH2.wav\",\n \""dir_chime6"/S21_U02.CH3.wav\",\n \""dir_chime6"/S21_U02.CH4.wav\"\n ],\n \"U03\": [\n \""dir_chime6"/S21_U03.CH1.wav\",\n \""dir_chime6"/S21_U03.CH2.wav\",\n \""dir_chime6"/S21_U03.CH3.wav\",\n \""dir_chime6"/S21_U03.CH4.wav\"\n ],\n \"U04\": [\n \""dir_chime6"/S21_U04.CH1.wav\",\n \""dir_chime6"/S21_U04.CH2.wav\",\n \""dir_chime6"/S21_U04.CH3.wav\",\n \""dir_chime6"/S21_U04.CH4.wav\"\n ],\n \"U05\": [\n \""dir_chime6"/S21_U05.CH1.wav\",\n \""dir_chime6"/S21_U05.CH2.wav\",\n \""dir_chime6"/S21_U05.CH3.wav\",\n \""dir_chime6"/S21_U05.CH4.wav\"\n ],\n \"U06\": [\n \""dir_chime6"/S21_U06.CH1.wav\",\n \""dir_chime6"/S21_U06.CH2.wav\",\n \""dir_chime6"/S21_U06.CH3.wav\",\n \""dir_chime6"/S21_U06.CH4.wav\"\n ]\n }\n },\n \"end\": "int($9*16000)",\n \"gender\": \""unk"\",\n \"location\": \""unk"\",\n \"notes\": [],\n \"num_samples\": "int($9*16000-$8*16000)",\n \"reference_array\": 
\""unk"\",\n \"session_id\": \"S21\",\n \"speaker_id\": \""spk"\",\n \"start\": "int($8*16000)",\n \"transcription\": \"\"\n }";} + } + END { printf"%s","\n }\n }\n}\n"; }' $segments > $json +fi diff --git a/egs/chime6/s5b_track2/local/get_hyp_perspeaker_perarray_file.py b/egs/chime6/s5b_track2/local/get_hyp_perspeaker_perarray_file.py new file mode 100755 index 00000000000..091cf7c05b1 --- /dev/null +++ b/egs/chime6/s5b_track2/local/get_hyp_perspeaker_perarray_file.py @@ -0,0 +1,63 @@ +#! /usr/bin/env python +# Copyright 2019 Ashish Arora +# Apache 2.0. +"""This script splits a kaldi (text) file + into per_array per_session per_speaker hypothesis (text) files""" + +import argparse +def get_args(): + parser = argparse.ArgumentParser( + description="""This script splits a kaldi text file + into per_array per_session per_speaker text files""") + parser.add_argument("input_text_path", type=str, + help="path of text files") + parser.add_argument("output_dir_path", type=str, + help="Output path for per_array per_session per_speaker reference files") + args = parser.parse_args() + return args + + +def main(): + # S09_U06.ENH-4-704588-704738 + args = get_args() + sessionid_micid_speakerid_dict= {} + for line in open(args.input_text_path): + parts = line.strip().split() + uttid_id = parts[0] + temp = uttid_id.strip().split('.')[0] + micid = temp.strip().split('_')[1] + speakerid = uttid_id.strip().split('-')[1] + sessionid = uttid_id.strip().split('_')[0] + sessionid_micid_speakerid = sessionid + '_' + micid + '_' + speakerid + if sessionid_micid_speakerid not in sessionid_micid_speakerid_dict: + sessionid_micid_speakerid_dict[sessionid_micid_speakerid]=list() + sessionid_micid_speakerid_dict[sessionid_micid_speakerid].append(line) + + for sessionid_micid_speakerid in sorted(sessionid_micid_speakerid_dict): + hyp_file = args.output_dir_path + '/' + 'hyp' + '_' + sessionid_micid_speakerid + hyp_writer = open(hyp_file, 'w') + combined_hyp_file = args.output_dir_path + '/' + 'hyp' + '_' + sessionid_micid_speakerid + '_comb' + combined_hyp_writer = open(combined_hyp_file, 'w') + utterances = sessionid_micid_speakerid_dict[sessionid_micid_speakerid] + # sorting utterances by start and end time + sessionid_micid_speakerid_utterances={} + for line in utterances: + parts = line.strip().split() + utt_parts = parts[0].strip().split('-') + time ='-'.join(utt_parts[2:]) + sessionid_micid_speakerid_utterances[time] = line + text = '' + for time_key in sorted(sessionid_micid_speakerid_utterances): + parts = sessionid_micid_speakerid_utterances[time_key].strip().split() + text = text + ' ' + ' '.join(parts[1:]) + hyp_writer.write(sessionid_micid_speakerid_utterances[time_key]) + combined_utterance = 'utt' + " " + text + combined_hyp_writer.write(combined_utterance) + combined_hyp_writer.write('\n') + combined_hyp_writer.close() + hyp_writer.close() + + +if __name__ == '__main__': + main() + diff --git a/egs/chime6/s5b_track2/local/get_ref_perspeaker_persession_file.py b/egs/chime6/s5b_track2/local/get_ref_perspeaker_persession_file.py new file mode 100755 index 00000000000..a4394984876 --- /dev/null +++ b/egs/chime6/s5b_track2/local/get_ref_perspeaker_persession_file.py @@ -0,0 +1,86 @@ +#! /usr/bin/env python +# Copyright 2019 Ashish Arora +# Apache 2.0. 
+"""This script splits a kaldi (text) file + into per_speaker per_session reference (text) file""" + +import argparse + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script splits a kaldi text file + into per_speaker per_session text files""") + parser.add_argument("input_text_path", type=str, + help="path of text file") + parser.add_argument("output_dir_path", type=str, + help="Output path for per_session per_speaker reference files") + args = parser.parse_args() + return args + + +def main(): + args = get_args() + sessionid_speakerid_dict= {} + spkrid_mapping = {} + for line in open(args.input_text_path): + parts = line.strip().split() + uttid_id = parts[0] + speakerid = uttid_id.strip().split('_')[0] + sessionid = uttid_id.strip().split('_')[1] + sessionid_speakerid = sessionid + '_' + speakerid + if sessionid_speakerid not in sessionid_speakerid_dict: + sessionid_speakerid_dict[sessionid_speakerid]=list() + sessionid_speakerid_dict[sessionid_speakerid].append(line) + + spkr_num = 1 + prev_sessionid = '' + for sessionid_speakerid in sorted(sessionid_speakerid_dict): + spkr_id = sessionid_speakerid.strip().split('_')[1] + curr_sessionid = sessionid_speakerid.strip().split('_')[0] + if prev_sessionid != curr_sessionid: + prev_sessionid = curr_sessionid + spkr_num = 1 + if spkr_id not in spkrid_mapping: + spkrid_mapping[spkr_id] = spkr_num + spkr_num += 1 + + for sessionid_speakerid in sorted(sessionid_speakerid_dict): + ref_file = args.output_dir_path + '/ref_' + sessionid_speakerid.split('_')[0] + '_' + str( + spkrid_mapping[sessionid_speakerid.split('_')[1]]) + ref_writer = open(ref_file, 'w') + wc_file = args.output_dir_path + '/ref_wc_' + sessionid_speakerid.split('_')[0] + '_' + str( + spkrid_mapping[sessionid_speakerid.split('_')[1]]) + wc_writer = open(wc_file, 'w') + combined_ref_file = args.output_dir_path + '/ref_' + sessionid_speakerid.split('_')[0] + '_' + str( + spkrid_mapping[sessionid_speakerid.split('_')[1]]) + '_comb' + combined_ref_writer = open(combined_ref_file, 'w') + utterances = sessionid_speakerid_dict[sessionid_speakerid] + sessionid_speakerid_utterances = {} + # sorting utterances by start and end time + for line in utterances: + parts = line.strip().split() + utt_parts = parts[0].strip().split('-') + time ='-'.join(utt_parts[1:]) + sessionid_speakerid_utterances[time] = line + text = '' + uttid_wc = 'utt' + for time_key in sorted(sessionid_speakerid_utterances): + parts = sessionid_speakerid_utterances[time_key].strip().split() + uttid_id = parts[0] + utt_text = ' '.join(parts[1:]) + text = text + ' ' + ' '.join(parts[1:]) + ref_writer.write(sessionid_speakerid_utterances[time_key]) + length = str(len(utt_text.split())) + uttid_id_len = uttid_id + ":" + length + uttid_wc = uttid_wc + ' ' + uttid_id_len + combined_utterance = 'utt' + " " + text + combined_ref_writer.write(combined_utterance) + combined_ref_writer.write('\n') + combined_ref_writer.close() + wc_writer.write(uttid_wc) + wc_writer.write('\n') + wc_writer.close() + ref_writer.close() + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5b_track2/local/install_dscore.sh b/egs/chime6/s5b_track2/local/install_dscore.sh new file mode 100755 index 00000000000..314f86f938e --- /dev/null +++ b/egs/chime6/s5b_track2/local/install_dscore.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +# Installs dscore +git clone https://github.com/nryant/dscore.git +pip3 install intervaltree --user +pip3 install tabulate --user +pip3 install munkres --user +pip3 install pytest --user diff 
--git a/egs/chime6/s5b_track2/local/install_pb_chime5.sh b/egs/chime6/s5b_track2/local/install_pb_chime5.sh new file mode 120000 index 00000000000..ce5ea5f9f08 --- /dev/null +++ b/egs/chime6/s5b_track2/local/install_pb_chime5.sh @@ -0,0 +1 @@ +../../s5_track1/local/install_pb_chime5.sh \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/json2text.py b/egs/chime6/s5b_track2/local/json2text.py new file mode 120000 index 00000000000..2aa0a8dd1f9 --- /dev/null +++ b/egs/chime6/s5b_track2/local/json2text.py @@ -0,0 +1 @@ +../../s5_track1/local/json2text.py \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/make_noise_list.py b/egs/chime6/s5b_track2/local/make_noise_list.py new file mode 120000 index 00000000000..d8dcc7822fc --- /dev/null +++ b/egs/chime6/s5b_track2/local/make_noise_list.py @@ -0,0 +1 @@ +../../s5_track1/local/make_noise_list.py \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/make_voxceleb1.pl b/egs/chime6/s5b_track2/local/make_voxceleb1.pl new file mode 100755 index 00000000000..2268c20ab52 --- /dev/null +++ b/egs/chime6/s5b_track2/local/make_voxceleb1.pl @@ -0,0 +1,130 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# +# Usage: make_voxceleb1.pl /export/voxceleb1 data/ + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb1 data/\n"; + exit(1); +} + +($data_base, $out_dir) = @ARGV; +my $out_test_dir = "$out_dir/voxceleb1_test"; +my $out_train_dir = "$out_dir/voxceleb1_train"; + +if (system("mkdir -p $out_test_dir") != 0) { + die "Error making directory $out_test_dir"; +} + +if (system("mkdir -p $out_train_dir") != 0) { + die "Error making directory $out_train_dir"; +} + +opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (! -e "$data_base/voxceleb1_test.txt") { + system("wget -O $data_base/voxceleb1_test.txt http://www.openslr.org/resources/49/voxceleb1_test.txt"); +} + +if (! 
-e "$data_base/vox1_meta.csv") { + system("wget -O $data_base/vox1_meta.csv http://www.openslr.org/resources/49/vox1_meta.csv"); +} + +open(TRIAL_IN, "<", "$data_base/voxceleb1_test.txt") or die "Could not open the verification trials file $data_base/voxceleb1_test.txt"; +open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv"; +open(SPKR_TEST, ">", "$out_test_dir/utt2spk") or die "Could not open the output file $out_test_dir/utt2spk"; +open(WAV_TEST, ">", "$out_test_dir/wav.scp") or die "Could not open the output file $out_test_dir/wav.scp"; +open(SPKR_TRAIN, ">", "$out_train_dir/utt2spk") or die "Could not open the output file $out_train_dir/utt2spk"; +open(WAV_TRAIN, ">", "$out_train_dir/wav.scp") or die "Could not open the output file $out_train_dir/wav.scp"; +open(TRIAL_OUT, ">", "$out_test_dir/trials") or die "Could not open the output file $out_test_dir/trials"; + +my %id2spkr = (); +while () { + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split; + $id2spkr{$vox_id} = $spkr_id; +} + +my $test_spkrs = (); +while () { + chomp; + my ($tar_or_non, $path1, $path2) = split; + + # Create entry for left-hand side of trial + my ($spkr_id, $filename) = split('/', $path1); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id1 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + # Create entry for right-hand side of trial + my ($spkr_id, $filename) = split('/', $path2); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id2 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; +} + +foreach (@spkr_dirs) { + my $spkr_id = $_; + my $new_spkr_id = $spkr_id; + # If we're using a newer version of VoxCeleb1, we need to "deanonymize" + # the speaker labels. 
+  if (exists $id2spkr{$spkr_id}) {
+    $new_spkr_id = $id2spkr{$spkr_id};
+  }
+  opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!";
+  my @files = map {s/\.[^.]+$//; $_} grep {/\.wav$/} readdir($dh);
+  closedir $dh;
+  foreach (@files) {
+    my $filename = $_;
+    my $rec_id = substr($filename, 0, 11);
+    my $segment = substr($filename, 12, 7);
+    my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav";
+    my $utt_id = "$new_spkr_id-$rec_id-$segment";
+    if (exists $test_spkrs{$new_spkr_id}) {
+      print WAV_TEST "$utt_id", " $wav", "\n";
+      print SPKR_TEST "$utt_id", " $new_spkr_id", "\n";
+    } else {
+      print WAV_TRAIN "$utt_id", " $wav", "\n";
+      print SPKR_TRAIN "$utt_id", " $new_spkr_id", "\n";
+    }
+  }
+}
+
+close(SPKR_TEST) or die;
+close(WAV_TEST) or die;
+close(SPKR_TRAIN) or die;
+close(WAV_TRAIN) or die;
+close(TRIAL_OUT) or die;
+close(TRIAL_IN) or die;
+close(META_IN) or die;
+
+if (system(
+  "utils/utt2spk_to_spk2utt.pl $out_test_dir/utt2spk >$out_test_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_test_dir";
+}
+system("env LC_COLLATE=C utils/fix_data_dir.sh $out_test_dir");
+if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_test_dir") != 0) {
+  die "Error validating directory $out_test_dir";
+}
+
+if (system(
+  "utils/utt2spk_to_spk2utt.pl $out_train_dir/utt2spk >$out_train_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_train_dir";
+}
+system("env LC_COLLATE=C utils/fix_data_dir.sh $out_train_dir");
+if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_train_dir") != 0) {
+  die "Error validating directory $out_train_dir";
+}
diff --git a/egs/chime6/s5b_track2/local/make_voxceleb2.pl b/egs/chime6/s5b_track2/local/make_voxceleb2.pl
new file mode 100755
index 00000000000..34c1591eba3
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/make_voxceleb2.pl
@@ -0,0 +1,70 @@
+#!/usr/bin/perl
+#
+# Copyright 2018 Ewald Enzinger
+#
+# Usage: make_voxceleb2.pl /export/voxceleb2 dev data/dev
+#
+# Note: This script requires ffmpeg to be installed and its location included in $PATH.
+
+if (@ARGV != 3) {
+  print STDERR "Usage: $0 <path-to-voxceleb2> <dataset> <path-to-data-dir>\n";
+  print STDERR "e.g. $0 /export/voxceleb2 dev data/dev\n";
+  exit(1);
+}
+
+# Check that ffmpeg is installed.
+if (`which ffmpeg` eq "") {
+  die "Error: this script requires that ffmpeg is installed.";
+}
+
+($data_base, $dataset, $out_dir) = @ARGV;
+
+if ("$dataset" ne "dev" && "$dataset" ne "test") {
+  die "dataset parameter must be 'dev' or 'test'!";
+}
+
+opendir my $dh, "$data_base/$dataset/aac" or die "Cannot open directory: $!";
+my @spkr_dirs = grep {-d "$data_base/$dataset/aac/$_" && ! /^\.{1,2}$/} readdir($dh);
+closedir $dh;
+
+if (system("mkdir -p $out_dir") != 0) {
+  die "Error making directory $out_dir";
+}
+
+open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
+open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
+
+foreach (@spkr_dirs) {
+  my $spkr_id = $_;
+
+  opendir my $dh, "$data_base/$dataset/aac/$spkr_id/" or die "Cannot open directory: $!";
+  my @rec_dirs = grep {-d "$data_base/$dataset/aac/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh);
+  closedir $dh;
+
+  foreach (@rec_dirs) {
+    my $rec_id = $_;
+
+    opendir my $dh, "$data_base/$dataset/aac/$spkr_id/$rec_id/" or die "Cannot open directory: $!";
+    my @files = map {s/\.[^.]+$//; $_} grep {/\.m4a$/} readdir($dh);
+    closedir $dh;
+
+    foreach (@files) {
+      my $name = $_;
+      my $wav = "ffmpeg -v 8 -i $data_base/$dataset/aac/$spkr_id/$rec_id/$name.m4a -f wav -acodec pcm_s16le -|";
+      my $utt_id = "$spkr_id-$rec_id-$name";
+      print WAV "$utt_id", " $wav", "\n";
+      print SPKR "$utt_id", " $spkr_id", "\n";
+    }
+  }
+}
+close(SPKR) or die;
+close(WAV) or die;
+
+if (system(
+  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
+if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
+  die "Error validating directory $out_dir";
+}
diff --git a/egs/chime6/s5b_track2/local/multispeaker_score.sh b/egs/chime6/s5b_track2/local/multispeaker_score.sh
new file mode 100755
index 00000000000..c7075d6cf14
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/multispeaker_score.sh
@@ -0,0 +1,144 @@
+#!/usr/bin/env bash
+# Copyright 2019 Ashish Arora, Yusuke Fujita
+# Apache 2.0.
+# This script takes a reference and hypothesis text file, and performs
+# multispeaker scoring.
+
+stage=0
+cmd=queue.pl
+num_spkrs=4
+num_hyp_spk=4
+datadir=dev_beamformit_dereverb
+get_stats=true
+all_array=false
+declare -a recording_id_array=("S02_U06" "S09_U06")
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <ref-file> <hyp-file> <out-dir>"
+  echo "e.g.: $0 data/diarized/text data/dev \
+    exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_xvector_sad/scoring_kaldi/penalty_1.0/10.txt \
+    exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_xvector_sad/scoring_kaldi_multispeaker"
+  echo "Options: "
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+ref_file=$1
+hyp_file=$2
+out_dir=$3
+
+output_dir=$out_dir/per_speaker_output
+wer_dir=$out_dir/per_speaker_wer
+
+# For the dev and eval sets, we take the corresponding arrays
+if [[ ${datadir} == *dev* ]]; then
+  recording_id_array=("S02_U06" "S09_U06")
+fi
+
+if [[ ${datadir} == *eval* ]]; then
+  recording_id_array=("S01_U06" "S21_U06")
+fi
+
+if [[ ${datadir} == *dev* ]] && [[ $all_array == "true" ]]; then
+  recording_id_array=("S02_U01" "S02_U02" "S02_U03" "S02_U04" "S02_U06" "S09_U01" "S09_U02" "S09_U03" "S09_U04" "S09_U06")
+fi
+
+if [[ ${datadir} == *eval* ]] && [[ $all_array == "true" ]]; then
+  recording_id_array=("S01_U01" "S01_U02" "S01_U03" "S01_U04" "S01_U06" "S21_U01" "S21_U02" "S21_U03" "S21_U04" "S21_U06")
+fi
+
+for f in $ref_file $hyp_file; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+if [ $stage -le 0 ]; then
+  # generate a per-speaker per-session file at paragraph level for the reference
+  # and a per-speaker per-array file at paragraph level for the hypothesis
+  mkdir -p $output_dir $wer_dir
+  local/wer_output_filter < $ref_file > $output_dir/ref_filt.txt
+  local/wer_output_filter < $hyp_file > $output_dir/hyp_filt.txt
+  local/get_ref_perspeaker_persession_file.py $output_dir/ref_filt.txt $output_dir
+  local/get_hyp_perspeaker_perarray_file.py $output_dir/hyp_filt.txt $output_dir
+fi
+
+if [ $stage -le 1 ]; then
+  if [ $num_hyp_spk -le 3 ]; then
+    # create dummy per-speaker per-array hypothesis files if the
+    # predicted number of speakers from diarization is less than 4
+    for recording_id in "${recording_id_array[@]}"; do
+      for (( i=$num_hyp_spk+1; i<$num_spkrs+1; i++ )); do
+        echo 'utt ' > ${output_dir}/hyp_${recording_id}_${i}_comb
+      done
+    done
+  fi
+fi
+
+if [ $stage -le 2 ]; then
+  # calculate WER for each pair of reference and hypothesis speakers
+  for recording_id in "${recording_id_array[@]}"; do
+    for (( i=0; i<$((num_spkrs * num_spkrs)); i++ )); do
+      ind_r=$((i / num_spkrs + 1))
+      ind_h=$((i % num_spkrs + 1))
+      sessionid="$(echo $recording_id | cut -d'_' -f1)"
+
+      # compute WER with combined texts
+      compute-wer --text --mode=present ark:${output_dir}/ref_${sessionid}_${ind_r}_comb \
+        ark:${output_dir}/hyp_${recording_id}_${ind_h}_comb \
+        > $wer_dir/wer_${recording_id}_r${ind_r}h${ind_h} 2>/dev/null
+    done
+
+    local/get_best_error.py $wer_dir $recording_id $num_spkrs
+  done
+fi
+
+if [ $stage -le 3 ]; then
+  # print the best word error rate;
+  # this prints the best WER for each recording and each array
+  cat $wer_dir/best_wer* > $wer_dir/all.txt
+  cat $wer_dir/all.txt | local/print_dset_error.py \
+    $output_dir/recordinid_spkorder > $wer_dir/array_wer.txt
+fi
+
+if [ $stage -le 4 ]; then
+  # checks if the DP result of the total error is equivalent
+  # to the sum of the individual errors:
+  local/check_dset_error.py $wer_dir $output_dir
+fi
+
+if [ $stage -le 5 ] && [[ $get_stats == "true" ]]; then
+  # generate per utterance wer details at utterance level
+  mkdir -p $wer_dir/wer_details $wer_dir/wer_details/log/
+  while read -r line;
+  do
+    recording_id=$(echo "$line" | cut -f1 -d ":")
+    spkorder_str=$(echo "$line" | cut -f2 -d ":")
+    sessionid=$(echo "$line" | cut -f1 -d "_")
+    IFS='_' read -r -a spkorder_list <<< "$spkorder_str"
+    IFS=" "
+    ind_r=1
+    for ind_h in "${spkorder_list[@]}"; do
+
+      $cmd $wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_comb.log \
+        align-text ark:${output_dir}/ref_${sessionid}_${ind_r}_comb ark:${output_dir}/hyp_${recording_id}_${ind_h}_comb ark:$output_dir/alignment_${sessionid}_r${ind_r}h${ind_h}.txt
+
+      # split hypothesis texts along with reference utterances using word alignment of combined texts
+      local/gen_aligned_hyp.py $output_dir/alignment_${sessionid}_r${ind_r}h${ind_h}.txt ${output_dir}/ref_wc_${sessionid}_${ind_r} > ${output_dir}/hyp_${recording_id}_r${ind_r}h${ind_h}_ref_segmentation
+
+      ## compute per utterance alignments
+      $cmd $wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_per_utt.log \
+        cat ${output_dir}/hyp_${recording_id}_r${ind_r}h${ind_h}_ref_segmentation \| \
+        align-text --special-symbol="'***'" ark:${output_dir}/ref_${sessionid}_${ind_r} ark:- ark,t:- \| \
+        utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $wer_dir/wer_details/per_utt_${recording_id}_r${ind_r}h${ind_h} || exit 1
+
+      $cmd $wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_ops.log \
+        cat $wer_dir/wer_details/per_utt_${recording_id}_r${ind_r}h${ind_h} \| \
+        utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
+        sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $wer_dir/wer_details/ops_${recording_id}_r${ind_r}h${ind_h} || exit 1;
+
+      ind_r=$(( ind_r + 1 ))
+    done
+  done < $output_dir/recordinid_spkorder
+  # done generating per utterance wer details
+fi
diff --git a/egs/chime6/s5b_track2/local/nnet3/compare_wer.sh b/egs/chime6/s5b_track2/local/nnet3/compare_wer.sh
new file mode 120000
index 00000000000..87041e833d0
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/nnet3/compare_wer.sh
@@ -0,0 +1 @@
+../../../s5_track1/local/nnet3/compare_wer.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/nnet3/decode.sh b/egs/chime6/s5b_track2/local/nnet3/decode.sh
new file mode 120000
index 00000000000..32595ccedbc
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/nnet3/decode.sh
@@ -0,0 +1 @@
+../../../s5_track1/local/nnet3/decode.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/nnet3/run_ivector_common.sh b/egs/chime6/s5b_track2/local/nnet3/run_ivector_common.sh
new file mode 120000
index 00000000000..4161993c225
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/nnet3/run_ivector_common.sh
@@ -0,0 +1 @@
+../../../s5_track1/local/nnet3/run_ivector_common.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/nnet3/xvector/prepare_feats.sh b/egs/chime6/s5b_track2/local/nnet3/xvector/prepare_feats.sh
new file mode 100755
index 00000000000..6b5ccd466c3
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/nnet3/xvector/prepare_feats.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+#
+# Apache 2.0.
+
+# This script applies sliding window CMVN and writes the features to disk.
+#
+# Although this kind of script isn't necessary in speaker recognition recipes,
+# it can be helpful in the diarization recipes. The script
+# diarization/nnet3/xvector/extract_xvectors.sh extracts x-vectors from very
+# short (e.g., 1-2 seconds) segments. Therefore, in order to apply the sliding
+# window CMVN in a meaningful way, it must be performed prior to performing
+# the subsegmentation.
+
+nj=40
+cmd="run.pl"
+stage=0
+norm_vars=false
+center=true
+compress=true
+cmn_window=300
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+if [ $# != 3 ]; then
+  echo "Usage: $0 <in-data-dir> <out-data-dir> <feat-dir>"
+  echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --norm-vars <true|false>                         # If true, normalize variances in the sliding window cmvn"
+  exit 1;
+fi
+
+data_in=$1
+data_out=$2
+dir=$3
+
+name=`basename $data_in`
+
+for f in $data_in/feats.scp ; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+# Set various variables.
+mkdir -p $dir/log
+mkdir -p $data_out
+featdir=$(utils/make_absolute.sh $dir)
+
+if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then
+  utils/create_split_dir.pl \
+    /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_cmvn_feats/storage $featdir/storage
+fi
+
+for n in $(seq $nj); do
+  # the next command does nothing unless $featdir/storage/ exists, see
+  # utils/create_data_link.pl for more info.
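+  # (A note on the feature pipeline further below: with --cmn-window=300 and
+  # the default 10 ms frame shift, each frame is mean-normalized over a
+  # sliding window of roughly 3 seconds; the standalone equivalent would be
+  # something like (illustrative paths):
+  #   apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 \
+  #     scp:data/train/feats.scp ark:- )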
+  utils/create_data_link.pl $featdir/xvector_cmvn_feats_${name}.${n}.ark
+done
+
+cp $data_in/utt2spk $data_out/utt2spk
+cp $data_in/spk2utt $data_out/spk2utt
+cp $data_in/wav.scp $data_out/wav.scp
+for f in $data_in/segments $data_in/vad.scp ; do
+  [ -f $f ] && cp $f $data_out/`basename $f`;
+done
+
+write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB"
+
+sdata_in=$data_in/split$nj;
+utils/split_data.sh $data_in $nj || exit 1;
+
+$cmd JOB=1:$nj $dir/log/create_xvector_cmvn_feats_${name}.JOB.log \
+  apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \
+  scp:${sdata_in}/JOB/feats.scp ark:- \| \
+  copy-feats --compress=$compress $write_num_frames_opt ark:- \
+  ark,scp:$featdir/xvector_cmvn_feats_${name}.JOB.ark,$featdir/xvector_cmvn_feats_${name}.JOB.scp || exit 1;
+
+for n in $(seq $nj); do
+  cat $featdir/xvector_cmvn_feats_${name}.$n.scp || exit 1;
+done > ${data_out}/feats.scp || exit 1
+
+for n in $(seq $nj); do
+  cat $featdir/log/utt2num_frames.$n || exit 1;
+done > $data_out/utt2num_frames || exit 1
+rm $featdir/log/utt2num_frames.*
+
+echo "$0: Succeeded creating xvector features for $name"
diff --git a/egs/chime6/s5b_track2/local/nnet3/xvector/prepare_feats_for_egs.sh b/egs/chime6/s5b_track2/local/nnet3/xvector/prepare_feats_for_egs.sh
new file mode 100755
index 00000000000..326b6dbb9fa
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/nnet3/xvector/prepare_feats_for_egs.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+#
+# Apache 2.0.
+
+# This script applies sliding window CMVN and removes silence frames. This
+# is performed on the raw features prior to generating examples for training
+# the x-vector system. Once the training examples are generated, the features
+# created by this script can be removed.
+
+nj=40
+cmd="run.pl"
+stage=0
+norm_vars=false
+center=true
+compress=true
+cmn_window=300
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+if [ $# != 3 ]; then
+  echo "Usage: $0 <in-data-dir> <out-data-dir> <feat-dir>"
+  echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --norm-vars <true|false>                         # If true, normalize variances in the sliding window cmvn"
+  exit 1;
+fi
+
+data_in=$1
+data_out=$2
+dir=$3
+
+name=`basename $data_in`
+
+for f in $data_in/feats.scp $data_in/vad.scp ; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+# Set various variables.
+mkdir -p $dir/log
+mkdir -p $data_out
+featdir=$(utils/make_absolute.sh $dir)
+
+if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then
+  utils/create_split_dir.pl \
+    /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage
+fi
+
+for n in $(seq $nj); do
+  # the next command does nothing unless $featdir/storage/ exists, see
+  # utils/create_data_link.pl for more info.
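+  # (Further below, unlike prepare_feats.sh, the pipeline also drops
+  # non-speech frames with select-voiced-frames using vad.scp, since x-vector
+  # training examples should contain speech only.)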
+  utils/create_data_link.pl $featdir/xvector_feats_${name}.${n}.ark
+done
+
+cp $data_in/utt2spk $data_out/utt2spk
+cp $data_in/spk2utt $data_out/spk2utt
+cp $data_in/wav.scp $data_out/wav.scp
+
+write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB"
+
+sdata_in=$data_in/split$nj;
+utils/split_data.sh $data_in $nj || exit 1;
+
+$cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \
+  apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \
+  scp:${sdata_in}/JOB/feats.scp ark:- \| \
+  select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \
+  copy-feats --compress=$compress $write_num_frames_opt ark:- \
+  ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1;
+
+for n in $(seq $nj); do
+  cat $featdir/xvector_feats_${name}.$n.scp || exit 1;
+done > ${data_out}/feats.scp || exit 1
+
+for n in $(seq $nj); do
+  cat $featdir/log/utt2num_frames.$n || exit 1;
+done > $data_out/utt2num_frames || exit 1
+rm $featdir/log/utt2num_frames.*
+
+echo "$0: Succeeded creating xvector features for $name"
diff --git a/egs/chime6/s5b_track2/local/nnet3/xvector/run_xvector.sh b/egs/chime6/s5b_track2/local/nnet3/xvector/run_xvector.sh
new file mode 120000
index 00000000000..585b63fd2dd
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/nnet3/xvector/run_xvector.sh
@@ -0,0 +1 @@
+tuning/run_xvector_1a.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/nnet3/xvector/tuning/run_xvector_1a.sh b/egs/chime6/s5b_track2/local/nnet3/xvector/tuning/run_xvector_1a.sh
new file mode 100755
index 00000000000..2189e406a7e
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/nnet3/xvector/tuning/run_xvector_1a.sh
@@ -0,0 +1,149 @@
+#!/usr/bin/env bash
+# Copyright 2018 David Snyder
+#           2018 Johns Hopkins University (Author: Daniel Garcia-Romero)
+#           2018 Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0.
+
+# This script trains the x-vector DNN. The recipe is similar to the one
+# described in "Diarization is Hard: Some Experiences and Lessons Learned
+# for the JHU Team in the Inaugural DIHARD Challenge" by Sell et al.
+
+. ./cmd.sh
+set -e
+
+stage=1
+train_stage=-1
+use_gpu=true
+remove_egs=false
+
+data=data/train
+nnet_dir=exp/xvector_nnet_1a/
+egs_dir=exp/xvector_nnet_1a/egs
+
+. ./path.sh
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l)
+
+# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh.
+# The argument --num-repeats is related to the number of times a speaker
+# repeats per archive. If it seems like you're getting too many archives
+# (e.g., more than 200) try increasing the --frames-per-iter option. The
+# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the
+# minimum and maximum length (in terms of number of frames) of the features
+# in the examples.
+#
+# To make sense of the egs script, it may be necessary to put an "exit 1"
+# command immediately after stage 3. Then, inspect
+# exp/<your-dir>/egs/temp/ranges.* . The ranges files specify the examples
+# that will be created, and which archives they will be stored in. Each line
+# of ranges.* has the following form:
+#   <utt-id> <local-ark-indx> <global-ark-indx> <start-frame> <end-frame> <spk-id>
+# For example:
+#   100304-f-sre2006-kacg-A 1 2 4079 881 23
+
+# If you're satisfied with the number of archives (e.g., 50-150 archives is
+# reasonable) and with the number of examples per speaker (e.g., 1000-5000
+# is reasonable) then you can let the script continue to the later stages.
+# Otherwise, try increasing or decreasing the --num-repeats option. You might
+# need to fiddle with --frames-per-iter. Increasing this value decreases the
+# number of archives and increases the number of examples per archive.
+# Decreasing this value increases the number of archives, while decreasing the
+# number of examples per archive.
+if [ $stage -le 6 ]; then
+  echo "$0: Getting neural network training egs";
+  # dump egs.
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then
+    utils/create_split_dir.pl \
+      /export/b{03,04,05,06}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage
+  fi
+  sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \
+    --nj 8 \
+    --stage 0 \
+    --frames-per-iter 1000000000 \
+    --frames-per-iter-diagnostic 500000 \
+    --min-frames-per-chunk 200 \
+    --max-frames-per-chunk 400 \
+    --num-diagnostic-archives 3 \
+    --num-repeats 40 \
+    "$data" $egs_dir
+fi
+
+if [ $stage -le 7 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+  num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}')
+  feat_dim=$(cat $egs_dir/info/feat_dim)
+
+  # This chunk-size corresponds to the maximum number of frames the
+  # stats layer is able to pool over. In this script, it corresponds
+  # to 4 seconds. If the input recording is greater than 4 seconds,
+  # we will compute multiple xvectors from the same recording and average
+  # to produce the final xvector.
+  max_chunk_size=400
+
+  # The smallest number of frames we're comfortable computing an xvector from.
+  # Note that the hard minimum is given by the left and right context of the
+  # frame-level layers.
+  min_chunk_size=20
+  mkdir -p $nnet_dir/configs
+  cat <<EOF > $nnet_dir/configs/network.xconfig
+  # please note that it is important to have input layer with the name=input
+
+  # The frame-level layers
+  input dim=${feat_dim} name=input
+  relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512
+  relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512
+  relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512
+  relu-batchnorm-layer name=tdnn4 dim=512
+  relu-batchnorm-layer name=tdnn5 dim=1500
+
+  # The stats pooling layer. Layers after this are segment-level.
+  # In the config below, the first and last argument (0, and ${max_chunk_size})
+  # means that we pool over an input segment starting at frame 0
+  # and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1)
+  # mean that no subsampling is performed.
+  stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size})
+
+  # This is where we usually extract the embedding (aka xvector) from.
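+  # (The extract.config written after this heredoc taps tdnn6.affine, i.e. the
+  # embedding is read before this layer's ReLU and batchnorm.)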
+  relu-batchnorm-layer name=tdnn6 dim=128 input=stats
+  output-layer name=output include-log-softmax=true dim=${num_targets}
+EOF
+
+  steps/nnet3/xconfig_to_configs.py \
+    --xconfig-file $nnet_dir/configs/network.xconfig \
+    --config-dir $nnet_dir/configs/
+  cp $nnet_dir/configs/final.config $nnet_dir/nnet.config
+
+  # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh
+  echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config
+  echo "$max_chunk_size" > $nnet_dir/max_chunk_size
+  echo "$min_chunk_size" > $nnet_dir/min_chunk_size
+fi
+
+dropout_schedule='0,0@0.20,0.1@0.50,0'
+srand=123
+if [ $stage -le 8 ]; then
+  steps/nnet3/train_raw_dnn.py --stage=$train_stage \
+    --cmd="$train_cmd" \
+    --trainer.optimization.proportional-shrink 10 \
+    --trainer.optimization.momentum=0.5 \
+    --trainer.optimization.num-jobs-initial=3 \
+    --trainer.optimization.num-jobs-final=8 \
+    --trainer.optimization.initial-effective-lrate=0.001 \
+    --trainer.optimization.final-effective-lrate=0.0001 \
+    --trainer.optimization.minibatch-size=64 \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2 \
+    --trainer.num-epochs=3 \
+    --trainer.dropout-schedule="$dropout_schedule" \
+    --trainer.shuffle-buffer-size=1000 \
+    --egs.frames-per-eg=1 \
+    --egs.dir="$egs_dir" \
+    --cleanup.remove-egs $remove_egs \
+    --cleanup.preserve-model-interval=10 \
+    --use-gpu=true \
+    --dir=$nnet_dir || exit 1;
+fi
+
+exit 0;
diff --git a/egs/chime6/s5b_track2/local/prepare_data.sh b/egs/chime6/s5b_track2/local/prepare_data.sh
new file mode 100755
index 00000000000..8bd2530d6db
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/prepare_data.sh
@@ -0,0 +1,149 @@
+#!/usr/bin/env bash
+#
+# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe, Yenda Trmal)
+# Apache 2.0
+
+# Begin configuration section.
+mictype=worn # worn, ref or others
+cleanup=true
+train=true
+
+# End configuration section
+. ./utils/parse_options.sh # accept options; e.g. you can run this script with the --mictype option
+
+. ./path.sh
+
+echo >&2 "$0" "$@"
+if [ $# -ne 3 ] ; then
+  echo >&2 "$0" "$@"
+  echo >&2 "$0: Error: wrong number of arguments"
+  echo -e >&2 "Usage:\n  $0 [opts] <audio-dir> <json-transcription-dir> <out-data-dir>"
+  echo -e >&2 "eg:\n  $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train"
+  exit 1
+fi
+
+set -e -o pipefail
+
+adir=$1
+jdir=$2
+dir=$3
+
+json_count=$(find -L $jdir -name "*.json" | wc -l)
+wav_count=$(find -L $adir -name "*.wav" | wc -l)
+
+if [ "$json_count" -eq 0 ]; then
+  echo >&2 "We expect that the directory $jdir will contain json files."
+  echo >&2 "That implies you have supplied a wrong path to the data."
+  exit 1
+fi
+if [ "$wav_count" -eq 0 ]; then
+  echo >&2 "We expect that the directory $adir will contain wav files."
+  echo >&2 "That implies you have supplied a wrong path to the data."
+  exit 1
+fi
+
+echo "$0: Converting transcription to text"
+
+mkdir -p $dir
+for file in $jdir/*json; do
+  ./local/json2text.py --mictype $mictype $file
+done | \
+  sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" |\
+  sed -e 's/ - / /g' |\
+  sed -e 's/mm-/mm/g' > $dir/text.orig
+
+echo "$0: Creating datadir $dir for type=\"$mictype\""
+
+if [ $mictype == "worn" ]; then
+  # convert the filenames to wav.scp format, use the basename of the file
+  # as the wav.scp key, add .L and .R for the left and right channel,
+  # i.e. each file will have two entries (left and right channel)
+  find -L $adir -name "S[0-9]*_P[0-9]*.wav" | \
+    perl -ne '{
+      chomp;
+      $path = $_;
+      next unless $path;
+      @F = split "/", $path;
+      ($f = $F[@F-1]) =~ s/.wav//;
+      @F = split "_", $f;
+      print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n";
+      print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n";
+    }' | sort > $dir/wav.scp
+
+  # generate the transcripts for both left and right channel
+  # from the original transcript in the form
+  # P09_S03-0006072-0006147 gimme the baker
+  # create left and right channel transcript
+  # P09_S03.L-0006072-0006147 gimme the baker
+  # P09_S03.R-0006072-0006147 gimme the baker
+  sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' $dir/text.orig | sort > $dir/text
+elif [ $mictype == "ref" ]; then
+  # fixed reference array
+
+  # first get a text, which will be used to extract reference arrays
+  perl -ne 's/-/.ENH-/;print;' $dir/text.orig | sort > $dir/text
+
+  find -L $adir | grep "\.wav" | sort > $dir/wav.flist
+  # the following command provides the argument for grep to extract only reference arrays
+  #grep `cut -f 1 -d"-" $dir/text | awk -F"_" '{print $2 "_" $3}' | sed -e "s/\.ENH//" | sort | uniq | sed -e "s/^/ -e /" | tr "\n" " "` $dir/wav.flist > $dir/wav.flist2
+  paste -d" " \
+    <(awk -F "/" '{print $NF}' $dir/wav.flist | sed -e "s/\.wav/.ENH/") \
+    $dir/wav.flist | sort > $dir/wav.scp
+else
+  # array mic case
+  # convert the filenames to wav.scp format, use the basename of the file
+  # as the wav.scp key
+  find -L $adir -name "*.wav" -ipath "*${mictype}*" |\
+    perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\
+    sort -u > $dir/wav.scp
+
+  # convert the transcripts from
+  # P09_S03-0006072-0006147 gimme the baker
+  # to the per-channel transcripts
+  # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker
+  # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker
+  # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker
+  # P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker
+  perl -ne '$l=$_;
+    for($i=1; $i<=4; $i++) {
+      ($x=$l)=~ s/-/.CH\Q$i\E-/;
+      print $x;}' $dir/text.orig | sort > $dir/text
+
+fi
+$cleanup && rm -f $dir/text.* $dir/wav.scp.* $dir/wav.flist
+
+# Prepare 'segments', 'utt2spk', 'spk2utt'
+if [ $mictype == "worn" ]; then
+  cut -d" " -f 1 $dir/text | \
+    awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\
+    sed -e "s/_[A-Z]*\././2" \
+    > $dir/segments
+elif [ $mictype == "ref" ]; then
+  cut -d" " -f 1 $dir/text | \
+    awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\
+    sed -e "s/_[A-Z]*\././2" |\
+    sed -e "s/ P.._/ /" > $dir/segments
+else
+  cut -d" " -f 1 $dir/text | \
+    awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\
+    sed -e "s/_[A-Z]*\././2" |\
+    sed -e 's/ P.._/ /' > $dir/segments
+fi
+cut -f 1 -d ' ' $dir/segments | \
+  perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > $dir/utt2spk
+
+utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
+
+if [ $train != 'true' ]; then
+  # For scoring the final system, we need the original utt2spk
+  # and text file. So we keep them with the extension .bak here
+  # so that they don't affect the validate_data_dir steps in
+  # the intermediate steps.
+  for file in text utt2spk spk2utt segments; do
+    mv $dir/$file $dir/$file.bak
+  done
+
+  # For dev and eval data, prepare pseudo utt2spk.
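+  # (e.g. a wav.scp key such as "S02_U02.CH1" simply becomes its own
+  # "speaker", since true speaker labels are unknown before diarization)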
+  awk '{print $1, $1}' $dir/wav.scp > $dir/utt2spk
+  utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
+fi
diff --git a/egs/chime6/s5b_track2/local/prepare_diarized_data.sh b/egs/chime6/s5b_track2/local/prepare_diarized_data.sh
new file mode 100755
index 00000000000..52468cb138b
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/prepare_diarized_data.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+# Copyright 2019 Ashish Arora, Vimal Manohar
+#           2020 Ivan Medennikov
+# Apache 2.0.
+# This script takes an rttm file, and prepares a diarized data directory.
+# The output directory contains a text file which can be used for scoring.
+
+stage=0
+nj=8
+cmd=run.pl
+echo "$0 $@"  # Print the command line for logging
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+if [ $# != 3 ]; then
+  echo "Usage: $0 <rttm-dir> <in-data-dir> <out-data-dir>"
+  echo "e.g.: $0 data/rttm data/dev data/dev_diarized"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs."
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+rttm_dir=$1
+data_in=$2
+out_dir=$3
+
+for f in $rttm_dir/rttm $data_in/wav.scp; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+if [ $stage -le 0 ]; then
+  echo "$0 copying data files in output directory"
+  cp $rttm_dir/rttm $rttm_dir/rttm_1
+  sed -i 's/'.ENH'/''/g' $rttm_dir/rttm_1
+  # removing the participant introduction from the hypothesis rttm;
+  # the UEM file contains the scoring durations for each recording
+  local/truncate_rttm.py $rttm_dir/rttm_1 local/uem_file $rttm_dir/rttm_introduction_removed
+  mkdir -p ${out_dir}_hires
+  cp ${data_in}/{wav.scp,utt2spk} ${out_dir}_hires
+  utils/data/get_reco2dur.sh ${out_dir}_hires
+fi
+
+if [ $stage -le 1 ]; then
+  echo "$0 creating segments file from rttm and utt2spk, reco2file_and_channel"
+  local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm_dir/rttm_introduction_removed \
+    <(awk '{print $2".ENH "$2" "$3}' $rttm_dir/rttm_introduction_removed |sort -u) \
+    ${out_dir}_hires/utt2spk ${out_dir}_hires/segments
+
+  utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt
+
+  awk '{print $1" "$1" 1"}' ${out_dir}_hires/wav.scp > ${out_dir}_hires/reco2file_and_channel
+  utils/fix_data_dir.sh ${out_dir}_hires || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0 extracting mfcc features using segments file"
+  steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$cmd" ${out_dir}_hires
+  steps/compute_cmvn_stats.sh ${out_dir}_hires
+  cp $data_in/text.bak ${out_dir}_hires/text
+fi
diff --git a/egs/chime6/s5b_track2/local/prepare_dict.sh b/egs/chime6/s5b_track2/local/prepare_dict.sh
new file mode 120000
index 00000000000..ada30947463
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/prepare_dict.sh
@@ -0,0 +1 @@
+../../s5_track1/local/prepare_dict.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/prepare_gss_data.sh b/egs/chime6/s5b_track2/local/prepare_gss_data.sh
new file mode 100755
index 00000000000..78f8af91dd8
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/prepare_gss_data.sh
@@ -0,0 +1,37 @@
+#!/bin/bash -u
+# Copyright 2020 Prisyach Tatyana (STC-innovations Ltd)
+
+. ./utils/parse_options.sh
+. ./path.sh
+. ./cmd.sh
+
+echo >&2 "$0" "$@"
+if [ $# -ne 3 ] ; then
+  echo >&2 "$0" "$@"
+  echo >&2 "$0: Error: wrong number of arguments"
+  echo -e >&2 "Usage:\n  $0 <gss-audio-dir> <src-data-dir> <out-data-dir>"
+  echo -e >&2 "eg:\n  $0 enhanced_dir/audio/dev data/dev data/dev_gss"
+  exit 1
+fi
+
+nj=8
+
+gss_dir=$1
+src_dir=$2
+dir=$3
+
+wav_list=$(find -L $gss_dir -name "*.wav" -printf "%f")
+if [ ! -d $dir ]; then
+  mkdir ${dir}
+fi
+echo $wav_list | awk -F ".wav" -v gss_dir=$gss_dir '{for (i=1; i<=NF; i++) {print $i" "gss_dir"/"$i".wav";}}' > $dir/wav_temp.scp
+sort -u $dir/wav_temp.scp > $dir/wav.scp
+rm -f $dir/wav_temp.scp
+cp $src_dir/utt2spk $dir/utt2spk
+
+utils/fix_data_dir.sh $dir
+
+echo "$0 extracting mfcc features using segments file"
+steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$decode_cmd" ${dir}
+steps/compute_cmvn_stats.sh ${dir}
+cp $src_dir/text ${dir}/text
diff --git a/egs/chime6/s5b_track2/local/print_dset_error.py b/egs/chime6/s5b_track2/local/print_dset_error.py
new file mode 100755
index 00000000000..8ffe930f4f6
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/print_dset_error.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+# Copyright 2019 Ashish Arora
+# Apache 2.0.
+
+import sys, io
+
+infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
+output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+spkorder_writer = open(sys.argv[1], 'w', encoding='utf8')
+array_id_error_dict = {}
+for line in infile:
+    # parse the "best_wer" lines written by get_best_error.py
+    toks = line.strip().split()
+    recordingid = toks[1]
+    total_words = toks[-5][:-1]
+    total_errors = toks[-4][:-1]
+    total_ins = toks[-3][:-1]
+    total_del = toks[-2][:-1]
+    total_sub = toks[-1]
+    spk_order = toks[6][1] + '_' + toks[7][0] + '_' + toks[8][0] + '_' + toks[9][0]
+    spkorder_writer.write(recordingid + ':' + spk_order + '\n')
+    arrayid = recordingid.strip().split('_')[1]
+    if arrayid not in array_id_error_dict:
+        array_id_error_dict[arrayid] = [0] * 5
+    array_id_error_dict[arrayid][0] += int(total_words)
+    array_id_error_dict[arrayid][1] += int(total_errors)
+    array_id_error_dict[arrayid][2] += int(total_ins)
+    array_id_error_dict[arrayid][3] += int(total_del)
+    array_id_error_dict[arrayid][4] += int(total_sub)
+
+
+for arrayid in sorted(array_id_error_dict):
+    wer = float(array_id_error_dict[arrayid][1]) / float(array_id_error_dict[arrayid][0]) * 100
+    wer_detail = "%WER {0:5.2f} [ {1} / {2}, {3} ins, {4} del, {5} sub ]".format(
+        wer, array_id_error_dict[arrayid][1], array_id_error_dict[arrayid][0],
+        array_id_error_dict[arrayid][2], array_id_error_dict[arrayid][3], array_id_error_dict[arrayid][4])
+    output.write(arrayid + ' ' + wer_detail + '\n')
diff --git a/egs/chime6/s5b_track2/local/reverberate_lat_dir.sh b/egs/chime6/s5b_track2/local/reverberate_lat_dir.sh
new file mode 120000
index 00000000000..57302268f6d
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/reverberate_lat_dir.sh
@@ -0,0 +1 @@
+../../s5_track1/local/reverberate_lat_dir.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/run_beamformit.sh b/egs/chime6/s5b_track2/local/run_beamformit.sh
new file mode 120000
index 00000000000..832a16e3ba7
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/run_beamformit.sh
@@ -0,0 +1 @@
+../../s5_track1/local/run_beamformit.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/run_gss.sh b/egs/chime6/s5b_track2/local/run_gss.sh
new file mode 120000
index 00000000000..1711fb3f821
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/run_gss.sh
@@ -0,0 +1 @@
+../../s5_track1/local/run_gss.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/run_ivector_common.sh b/egs/chime6/s5b_track2/local/run_ivector_common.sh
new file mode 120000
index 00000000000..df7fca84335
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/run_ivector_common.sh
@@ -0,0 +1 @@
+../../s5_track1/local/nnet3/run_ivector_common.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/run_wpe.py b/egs/chime6/s5b_track2/local/run_wpe.py
new file mode 120000
index 00000000000..6621607c932
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/run_wpe.py
@@ -0,0 +1 @@
+../../s5_track1/local/run_wpe.py
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/run_wpe.sh b/egs/chime6/s5b_track2/local/run_wpe.sh
new file mode 120000
index 00000000000..187080e62e4
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/run_wpe.sh
@@ -0,0 +1 @@
+../../s5_track1/local/run_wpe.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/score.sh b/egs/chime6/s5b_track2/local/score.sh
new file mode 120000
index 00000000000..6a200b42ed3
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/score.sh
@@ -0,0 +1 @@
+../steps/scoring/score_kaldi_wer.sh
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/score_for_submit.sh b/egs/chime6/s5b_track2/local/score_for_submit.sh
new file mode 100755
index 00000000000..71a3a4dd607
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/score_for_submit.sh
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+# Apache 2.0
+#
+# This script provides CHiME-6 challenge track 2 submission scores.
+# It selects the best search parameter configuration on the dev set
+# and reports WER for both dev and eval.
+
+cmd=run.pl
+stage=0
+word_ins_penalty=0.0,0.5,1.0
+min_lmwt=7
+max_lmwt=17
+dev_decodedir=exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_beamformit_dereverb_diarized_2stage
+eval_decodedir=exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_eval_beamformit_dereverb_diarized_2stage
+dev_datadir=dev_beamformit_dereverb_diarized_hires
+eval_datadir=eval_beamformit_dereverb_diarized_hires
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+if [ $# -ne 0 ]; then
+  echo "Usage: $0 [--cmd (run.pl|queue.pl...)]"
+  echo "This script provides CHiME-6 challenge submission scores"
+  echo " Options:"
+  echo "    --cmd (run.pl|queue.pl...)       # specify how to run the sub-processes."
+ echo " --dev_decodedir # dev set decoding directory" + echo " --eval_decodedir # eval set decoding directory" + echo " --dev_datadir # dev set data directory" + echo " --eval_datadir # eval set data directory" + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + + exit 1; +fi + +if [ $stage -le 1 ]; then + # obtaining multi speaker WER for all lmwt and wip + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for LMWT in $(seq $min_lmwt $max_lmwt); do + local/multispeaker_score.sh --cmd "$cmd" \ + --datadir $dev_datadir --get_stats false data/$dev_datadir/text \ + $dev_decodedir/scoring_kaldi/penalty_$wip/$LMWT.txt \ + $dev_decodedir/scoring_kaldi_multispeaker/penalty_$wip/$LMWT + done + done +fi + +if [ $stage -le 2 ]; then + # obtaining best lmwt, wip and wer + # adding /dev/null to the command list below forces grep to output the filename + mkdir -p $dev_decodedir/scoring_kaldi_multispeaker + grep WER $dev_decodedir/scoring_kaldi_multispeaker/penalty_*/*/per_speaker_wer/array_wer.txt /dev/null \ + | utils/best_wer.sh >& $dev_decodedir/scoring_kaldi_multispeaker/best_wer + + best_wer_file=$(awk '{print $NF}' $dev_decodedir/scoring_kaldi_multispeaker/best_wer) + best_array=$(echo $best_wer_file | awk -F: '{N=NF; print $N}') + best_lmwt=$(echo $best_wer_file | awk -F/ '{N=NF-2; print $N}') + best_wip=$(echo $best_wer_file | awk -F_ '{N=NF-3; print $N}' | awk -F/ '{N=NF-2; print $N}') + + # printing and storing best lmwt, best_array and wip + echo "best array: $best_array" + echo "best LM weight: $best_lmwt" + echo "best insertion penalty weight: $best_wip" + + echo $best_lmwt > $dev_decodedir/scoring_kaldi_multispeaker/lmwt + echo $best_wip > $dev_decodedir/scoring_kaldi_multispeaker/wip + echo $best_array > $dev_decodedir/scoring_kaldi_multispeaker/best_array +fi + +if [ $stage -le 3 ]; then + # obtaining per utterance stats for dev + local/multispeaker_score.sh --cmd "$cmd" \ + --datadir $dev_datadir data/$dev_datadir/text \ + $dev_decodedir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + $dev_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/ +fi + +if [ $stage -le 4 ]; then + # obtaining per utterance stats for eval + local/multispeaker_score.sh --cmd "$cmd" \ + --datadir $eval_datadir data/$eval_datadir/text \ + $eval_decodedir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + $eval_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/ +fi + +if [ $stage -le 5 ]; then + # obtaining eval wer corresponding to best lmwt, best_array and wip of dev + best_array="$(cat $dev_decodedir/scoring_kaldi_multispeaker/best_array)" + best_lmwt="$(cat $dev_decodedir/scoring_kaldi_multispeaker/lmwt)" + best_wip="$(cat $dev_decodedir/scoring_kaldi_multispeaker/wip)" + + grep WER $eval_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/per_speaker_wer/array_wer.txt /dev/null \ + | grep $best_array | utils/best_wer.sh >& $eval_decodedir/scoring_kaldi_multispeaker/best_wer + + # printing dev and eval wer + echo "Dev: $(<$dev_decodedir/scoring_kaldi_multispeaker/best_wer)" | cut -d " " -f 1-15 + echo "Eval: $(<$eval_decodedir/scoring_kaldi_multispeaker/best_wer)" | cut -d " " -f 1-14 +fi + diff --git a/egs/chime6/s5b_track2/local/segmentation/detect_speech_activity.sh b/egs/chime6/s5b_track2/local/segmentation/detect_speech_activity.sh new file mode 100755 index 00000000000..c9719d472f3 --- /dev/null +++ b/egs/chime6/s5b_track2/local/segmentation/detect_speech_activity.sh @@ 
-0,0 +1,217 @@ +#!/usr/bin/env bash + +# Copyright 2016-17 Vimal Manohar +# 2017 Nagendra Kumar Goel +# Apache 2.0. + +# This script does nnet3-based speech activity detection given an input +# kaldi data directory and outputs a segmented kaldi data directory. +# This script can also do music detection and other similar segmentation +# using appropriate options such as --output-name output-music. + +set -e +set -o pipefail +set -u + +if [ -f ./path.sh ]; then . ./path.sh; fi + +affix= # Affix for the segmentation +nj=32 +cmd=queue.pl +stage=-1 + +# Feature options (Must match training) +mfcc_config=conf/mfcc_hires.conf +feat_affix= # Affix for the type of feature used + +output_name=output # The output node in the network +sad_name=sad # Base name for the directory storing the computed loglikes + # Can be music for music detection +segmentation_name=segmentation # Base name for the directory doing segmentation + # Can be segmentation_music for music detection + +# SAD network config +iter=final # Model iteration to use + +# Contexts must ideally match training for LSTM models, but +# may not necessarily for stats components +extra_left_context=0 # Set to some large value, typically 40 for LSTM (must match training) +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +frames_per_chunk=150 + +# Decoding options +graph_opts="--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0" +acwt=0.3 + +# These <from>_in_<to>_weight represent the fraction of probability +# to transfer to class <to>. +# e.g. --speech-in-sil-weight=0.0 --garbage-in-sil-weight=0.0 --sil-in-speech-weight=0.0 --garbage-in-speech-weight=0.3 +transform_probs_opts="" + +# Postprocessing options +segment_padding=0.2 # Duration (in seconds) of padding added to segments +min_segment_dur=0 # Minimum duration (in seconds) required for a segment to be included + # This is before any padding. Segments shorter than this duration will be removed. + # This is an alternative to --min-speech-duration above. +merge_consecutive_max_dur=0 # Merge consecutive segments as long as the merged segment is no longer than this many + # seconds. The segments are only merged if their boundaries are touching. + # This is after padding by --segment-padding seconds. + # 0 means do not merge. Use 'inf' to not limit the duration. + +echo $* + +. utils/parse_options.sh + +if [ $# -ne 5 ]; then + echo "This script does nnet3-based speech activity detection given an input kaldi " + echo "data directory and outputs an output kaldi data directory." + echo "See script for details of the options to be supplied." + echo "Usage: $0 <src-data-dir> <sad-nnet-dir> <mfcc-dir> <work-dir> <out-data-dir>" + echo " e.g.: $0 ~/workspace/egs/ami/s5b/data/sdm1/dev exp/nnet3_sad_snr/nnet_tdnn_j_n4 \\" + echo " mfcc_hires exp/segmentation_sad_snr/nnet_tdnn_j_n4 data/ami_sdm1_dev" + echo "" + echo "Options: " + echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." + echo " --nj # number of parallel jobs to run." + echo " --stage # stage to do partial re-run from." + echo " --convert-data-dir-to-whole # If true, the input data directory is " + echo " # first converted to whole data directory (i.e. whole recordings) " + echo " # and segmentation is done on that." + echo " # If false, then the original segments are " + echo " # retained and they are split into sub-segments." 
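+ echo " --acwt # acoustic scale used in Viterbi decoding (default: 0.3)" + echo " --segment-padding # duration (in seconds) of padding added to segments (default: 0.2)" 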
+ echo " --output-name # The output node in the network" + echo " --extra-left-context # Set to some large value, typically 40 for LSTM (must match training)" + echo " --extra-right-context # For BLSTM or statistics pooling" + exit 1 +fi + +src_data_dir=$1 # The input data directory that needs to be segmented. + # If convert_data_dir_to_whole is true, any segments in that will be ignored. +sad_nnet_dir=$2 # The SAD neural network +mfcc_dir=$3 # The directory to store the features +dir=$4 # Work directory +data_dir=$5 # The output data directory will be ${data_dir}_seg + +affix=${affix:+_$affix} +feat_affix=${feat_affix:+_$feat_affix} + +data_id=`basename $data_dir` +sad_dir=${dir}/${sad_name}${affix}_${data_id}${feat_affix} +seg_dir=${dir}/${segmentation_name}${affix}_${data_id}${feat_affix} +test_data_dir=data/${data_id}${feat_affix} + +############################################################################### +## Forward pass through the network network and dump the log-likelihoods. +############################################################################### + +frame_subsampling_factor=1 +if [ -f $sad_nnet_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $sad_nnet_dir/frame_subsampling_factor) +fi + +mkdir -p $dir +if [ $stage -le 1 ]; then + if [ "$(readlink -f $sad_nnet_dir)" != "$(readlink -f $dir)" ]; then + cp $sad_nnet_dir/cmvn_opts $dir || exit 1 + fi + + ######################################################################## + ## Initialize neural network for decoding using the output $output_name + ######################################################################## + + if [ ! -z "$output_name" ] && [ "$output_name" != output ]; then + $cmd $dir/log/get_nnet_${output_name}.log \ + nnet3-copy --edits="rename-node old-name=$output_name new-name=output" \ + $sad_nnet_dir/$iter.raw $dir/${iter}_${output_name}.raw || exit 1 + iter=${iter}_${output_name} + else + if ! diff $sad_nnet_dir/$iter.raw $dir/$iter.raw; then + cp $sad_nnet_dir/$iter.raw $dir/ + fi + fi + + steps/nnet3/compute_output.sh --nj $nj --cmd "$cmd" \ + --iter ${iter} \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk $frames_per_chunk --apply-exp true \ + --frame-subsampling-factor $frame_subsampling_factor \ + ${test_data_dir} $dir $sad_dir || exit 1 +fi + +############################################################################### +## Prepare FST we search to make speech/silence decisions. +############################################################################### + +utils/data/get_utt2dur.sh --nj $nj --cmd "$cmd" $test_data_dir || exit 1 +frame_shift=$(utils/data/get_frame_shift.sh $test_data_dir) || exit 1 + +graph_dir=${dir}/graph_${output_name} +if [ $stage -le 2 ]; then + mkdir -p $graph_dir + + # 1 for silence and 2 for speech + cat < $graph_dir/words.txt + 0 +silence 1 +speech 2 +EOF + + $cmd $graph_dir/log/make_graph.log \ + steps/segmentation/internal/prepare_sad_graph.py $graph_opts \ + --frame-shift=$(perl -e "print $frame_shift * $frame_subsampling_factor") - \| \ + fstcompile --isymbols=$graph_dir/words.txt --osymbols=$graph_dir/words.txt '>' \ + $graph_dir/HCLG.fst +fi + +############################################################################### +## Do Viterbi decoding to create per-frame alignments. 
+############################################################################### + +post_vec=$sad_nnet_dir/post_${output_name}.vec +if [ ! -f $sad_nnet_dir/post_${output_name}.vec ]; then + if [ ! -f $sad_nnet_dir/post_${output_name}.txt ]; then + echo "$0: Could not find $sad_nnet_dir/post_${output_name}.vec. " + echo "Re-run the corresponding stage in the training script possibly " + echo "with --compute-average-posteriors=true or compute the priors " + echo "from the training labels" + exit 1 + else + post_vec=$sad_nnet_dir/post_${output_name}.txt + fi +fi + +mkdir -p $seg_dir +if [ $stage -le 3 ]; then + steps/segmentation/internal/get_transform_probs_mat.py \ + --priors="$post_vec" $transform_probs_opts > $seg_dir/transform_probs.mat + + steps/segmentation/decode_sad.sh --acwt $acwt --cmd "$cmd" \ + --nj $nj \ + --transform "$seg_dir/transform_probs.mat" \ + $graph_dir $sad_dir $seg_dir +fi + +############################################################################### +## Post-process segmentation to create kaldi data directory. +############################################################################### + +if [ $stage -le 4 ]; then + steps/segmentation/post_process_sad_to_segments.sh \ + --segment-padding $segment_padding --min-segment-dur $min_segment_dur \ + --merge-consecutive-max-dur $merge_consecutive_max_dur \ + --cmd "$cmd" --frame-shift $(perl -e "print $frame_subsampling_factor * $frame_shift") \ + ${test_data_dir} ${seg_dir} ${seg_dir} +fi + +if [ $stage -le 5 ]; then + utils/data/subsegment_data_dir.sh ${test_data_dir} ${seg_dir}/segments \ + ${data_dir}_seg +fi + +echo "$0: Created output segmented kaldi data directory in ${data_dir}_seg" +exit 0 diff --git a/egs/chime6/s5b_track2/local/segmentation/tuning/train_lstm_sad_1a.sh b/egs/chime6/s5b_track2/local/segmentation/tuning/train_lstm_sad_1a.sh new file mode 100755 index 00000000000..7ea39f45639 --- /dev/null +++ b/egs/chime6/s5b_track2/local/segmentation/tuning/train_lstm_sad_1a.sh @@ -0,0 +1,140 @@ +#!/usr/bin/env bash + +# Copyright 2017 Nagendra Kumar Goel +# 2018 Vimal Manohar +# Apache 2.0 + +# This is a script to train a TDNN+LSTM network for speech activity detection +# (SAD), using LSTM layers for long-context information. + +stage=0 +train_stage=-10 +get_egs_stage=-10 +egs_opts= + +chunk_width=20 + +extra_left_context=60 +extra_right_context=10 +relu_dim=256 +cell_dim=256 +projection_dim=64 + +# training options +num_epochs=1 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=8 +remove_egs=true +max_param_change=0.2 # Small max-param change for small network +dropout_schedule='0,0@0.20,0.1@0.50,0' + +egs_dir= +nj=40 + +dir= +affix=1a + +data_dir= +targets_dir= + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +set -o pipefail +set -u + +if [ -z "$dir" ]; then + dir=exp/segmentation_1a/tdnn_lstm_asr_sad +fi +dir=$dir${affix:+_$affix} + +if ! 
cuda-compiled; then + cat <<EOF && exit 1 +This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA. +If you want to use GPUs (and have them), go to src/, and configure and make on a machine +where "nvcc" is installed. +EOF +fi + +mkdir -p $dir + +cmvn_opts="--norm-means=false --norm-vars=false" +echo $cmvn_opts > $dir/cmvn_opts + +if [ $stage -le 5 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + mkdir -p $dir/configs + cat <<EOF > $dir/configs/network.xconfig + input dim=`feat-to-dim scp:$data_dir/feats.scp -` name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 input=lda dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn3 input=Append(-3,0,3,6) dim=$relu_dim add-log-stddev=true + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim decay-time=20 delay=-3 dropout-proportion=0.0 + relu-renorm-layer name=tdnn4 input=Append(-6,0,6,12) add-log-stddev=true dim=$relu_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim decay-time=20 delay=-3 dropout-proportion=0.0 + relu-renorm-layer name=tdnn5 input=Append(-12,0,12,24) dim=$relu_dim + + output-layer name=output include-log-softmax=true dim=3 learning-rate-factor=0.1 input=tdnn5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs/ + + cat <<EOF >> $dir/configs/vars +num_targets=3 +EOF +fi + +if [ $stage -le 6 ]; then + num_utts=`cat $data_dir/utt2spk | wc -l` + # Set num_utts_subset for diagnostics to a reasonable value + # of max(min(0.005 * num_utts, 300), 12) + num_utts_subset=`perl -e '$n=int($ARGV[0] * 0.005); print ($n > 300 ? 300 : ($n < 12 ? 12 : $n))' $num_utts` + + steps/nnet3/train_raw_rnn.py --stage=$train_stage \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$egs_dir" --egs.stage=$get_egs_stage \ + --egs.chunk-left-context=$extra_left_context \ + --egs.chunk-right-context=$extra_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.optimization.shrink-value=0.99 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.max-param-change=$max_param_change \ + --trainer.compute-per-dim-accuracy=true \ + --cmd="$decode_cmd" --nj $nj \ + --cleanup=true \ + --cleanup.remove-egs=$remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --use-dense-targets=true \ + --feat-dir=$data_dir \ + --targets-scp="$targets_dir/targets.scp" \ + --egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \ + --dir=$dir || exit 1 +fi + +if [ $stage -le 7 ]; then + # Use a subset to compute prior over the output targets + $train_cmd $dir/log/get_priors.log \ + matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \ + ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1 + + echo 3 > $dir/frame_subsampling_factor +fi diff --git a/egs/chime6/s5b_track2/local/segmentation/tuning/train_stats_sad_1a.sh 
b/egs/chime6/s5b_track2/local/segmentation/tuning/train_stats_sad_1a.sh new file mode 100755 index 00000000000..83bcd587d88 --- /dev/null +++ b/egs/chime6/s5b_track2/local/segmentation/tuning/train_stats_sad_1a.sh @@ -0,0 +1,150 @@ +#!/usr/bin/env bash + +# Copyright 2017 Nagendra Kumar Goel +# 2018 Vimal Manohar +# Apache 2.0 + +# This is a script to train a TDNN for speech activity detection (SAD) +# using statistics pooling for long-context information. + +stage=0 +train_stage=-10 +get_egs_stage=-10 +egs_opts= + +chunk_width=20 + +# The context is chosen to be around 1 second long. The context at test time +# is expected to be around the same. +extra_left_context=79 +extra_right_context=21 + +relu_dim=256 + +# training options +num_epochs=1 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=8 +remove_egs=true +max_param_change=0.2 # Small max-param change for small network + +egs_dir= +nj=40 + +dir= +affix=1a + +data_dir= +targets_dir= + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +set -o pipefail +set -u + +if [ -z "$dir" ]; then + dir=exp/segmentation_1a/tdnn_stats_sad +fi +dir=$dir${affix:+_$affix} + +if ! cuda-compiled; then + cat <<EOF && exit 1 +This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA. +If you want to use GPUs (and have them), go to src/, and configure and make on a machine +where "nvcc" is installed. +EOF +fi + +mkdir -p $dir + +cmvn_opts="--norm-means=false --norm-vars=false" +echo $cmvn_opts > $dir/cmvn_opts + +if [ $stage -le 5 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + mkdir -p $dir/configs + cat <<EOF > $dir/configs/network.xconfig + input dim=`feat-to-dim scp:$data_dir/feats.scp -` name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 input=lda dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn3 input=Append(-3,0,3,6) dim=$relu_dim add-log-stddev=true + stats-layer name=tdnn3_stats config=mean+count(-99:3:9:99) + relu-renorm-layer name=tdnn4 input=Append(tdnn3@-6,tdnn3@0,tdnn3@6,tdnn3@12,tdnn3_stats) add-log-stddev=true dim=$relu_dim + stats-layer name=tdnn4_stats config=mean+count(-108:6:18:108) + relu-renorm-layer name=tdnn5 input=Append(tdnn4@-12,tdnn4@0,tdnn4@12,tdnn4@24,tdnn4_stats) dim=$relu_dim + + output-layer name=output include-log-softmax=true dim=3 learning-rate-factor=0.1 input=tdnn5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs/ + + cat <<EOF >> $dir/configs/vars +num_targets=3 +EOF +fi + +if [ $stage -le 6 ]; then + num_utts=`cat $data_dir/utt2spk | wc -l` + # Set num_utts_subset for diagnostics to a reasonable value + # of max(min(0.005 * num_utts, 300), 12) + num_utts_subset=`perl -e '$n=int($ARGV[0] * 0.005); print ($n > 300 ? 300 : ($n < 12 ? 
12 : $n))' $num_utts` + + steps/nnet3/train_raw_rnn.py --stage=$train_stage \ + --feat.cmvn-opts="$cmvn_opts" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$egs_dir" --egs.stage=$get_egs_stage \ + --egs.chunk-left-context=$extra_left_context \ + --egs.chunk-right-context=$extra_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.max-param-change=$max_param_change \ + --trainer.compute-per-dim-accuracy=true \ + --cmd="$decode_cmd" --nj $nj \ + --cleanup=true \ + --cleanup.remove-egs=$remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --use-dense-targets=true \ + --feat-dir=$data_dir \ + --targets-scp="$targets_dir/targets.scp" \ + --egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \ + --dir=$dir || exit 1 +fi + +if [ $stage -le 7 ]; then + # Use a subset to compute prior over the output targets + #$train_cmd $dir/log/get_priors.log \ + # matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \ + # ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1 + + # Since the train data is from individual microphones, while the dev and + # eval data are beamformed, it is likely that the train data contains a much + # higher ratio of silence. So using priors computed from the train + # data may miss a lot of speech in the dev/eval sets. Hence we manually + # tune the prior on the dev set. + # With the following prior, the SAD system results are: + # Dev (using -c 0.25) + # MISSED SPEECH = 1188.59 secs ( 3.3 percent of scored time) + # FALARM SPEECH = 539.37 secs ( 1.5 percent of scored time) + echo "[ 30 2 1 ]" > $dir/post_output.vec || exit 1 + + echo 3 > $dir/frame_subsampling_factor +fi + diff --git a/egs/chime6/s5b_track2/local/train_diarizer.sh b/egs/chime6/s5b_track2/local/train_diarizer.sh new file mode 100755 index 00000000000..845ac7840d5 --- /dev/null +++ b/egs/chime6/s5b_track2/local/train_diarizer.sh @@ -0,0 +1,186 @@ +#!/usr/bin/env bash +# Copyright +# 2019 David Snyder +# Apache 2.0. +# +# This script is based on the run.sh script in the Voxceleb v2 recipe. +# It trains an x-vector DNN for diarization. + +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +voxceleb1_root=/export/corpora/VoxCeleb1 +voxceleb2_root=/export/corpora/VoxCeleb2 +data_dir=train_worn_simu_u400k +model_dir=exp/xvector_nnet_1a + +stage=0 +train_stage=-1 + +. ./cmd.sh + +if [ -f ./path.sh ]; then . ./path.sh; fi +set -e -u -o pipefail +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + exit 1 +fi + +if [ $stage -le 0 ]; then + echo "$0: preparing voxceleb 2 data" + local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train + local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test + + echo "$0: preparing voxceleb 1 data (see comments if this step fails)" + # The format of the voxceleb 1 corpus has changed several times since it was + # released. Therefore, our dataprep scripts may or may not fail depending + # on the version of the corpus you obtained. 
+ # If you downloaded the corpus soon after it was first released, this + # version of the dataprep script might work: + local/make_voxceleb1.pl $voxceleb1_root data/voxceleb1 + # However, if you've downloaded the corpus recently, you may need to use + # the following scripts instead: + #local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train + #local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test + + # We should now have about 7,351 speakers and 1,277,503 utterances. + utils/combine_data.sh data/voxceleb data/voxceleb2_train data/voxceleb2_test +fi + +if [ $stage -le 1 ]; then + echo "$0: preparing features for training data (voxceleb 1 + 2)" + steps/make_mfcc.sh --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/voxceleb exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/voxceleb + # Note that we apply CMN to the MFCCs and write these to the disk. These + # features will later be used to train the x-vector DNN. +fi + +# In this section, we augment the voxceleb data with reverberation. +# Note that we can probably improve the x-vector DNN if we include +# augmentations from the nonspeech regions of the Chime 6 training +# dataset. +if [ $stage -le 2 ]; then + echo "$0: applying augmentation to x-vector training data (just reverb for now)" + frame_shift=0.01 + awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/voxceleb/utt2num_frames > data/voxceleb/reco2dur + + if [ ! -d "RIRS_NOISES" ]; then + echo "$0: downloading simulated room impulse response dataset" + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # Make a version with reverberated speech + rvb_opts=() + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + + # Make a reverberated version of the training data. Note that we don't add any + # additive noise here. + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 0 \ + --isotropic-noise-addition-probability 0 \ + --num-replications 1 \ + --source-sampling-rate 16000 \ + data/voxceleb data/voxceleb_reverb + utils/copy_data_dir.sh --utt-suffix "-reverb" data/voxceleb_reverb data/voxceleb_reverb.new + rm -rf data/voxceleb_reverb + mv data/voxceleb_reverb.new data/voxceleb_reverb +fi + +if [ $stage -le 3 ]; then + echo "$0: making MFCCs for augmented training data" + # Make MFCCs for the augmented data. Note that we do not compute a new + # vad.scp file here. Instead, we use the vad.scp from the clean version of + # the list. + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/voxceleb_reverb exp/make_mfcc $mfccdir + # Combine the clean and augmented training data. This is now roughly + # double the size of the original clean list. + utils/combine_data.sh data/voxceleb_combined data/voxceleb_reverb data/voxceleb +fi + +# Now we prepare the features to generate examples for xvector training. +if [ $stage -le 4 ]; then + # This script applies CMVN and removes nonspeech frames. Note that this is somewhat + # wasteful, as it roughly doubles the amount of training data on disk. After + # creating voxceleb examples, this can be removed. 
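+ # (For intuition, the core of prepare_feats.sh in the standard recipe is a + # sliding-window CMVN pass over each feature matrix, roughly: + # apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 \ + # scp:data/voxceleb_combined/feats.scp ark:- | \ + # copy-feats --compress=true ark:- ark,scp:feats_cmn.ark,feats_cmn.scp + # This is only a sketch; the ark/scp names here are illustrative, and the + # job splitting and bookkeeping are handled inside the script.) 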
+ echo "$0: preparing features to train x-vector DNN" + local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/voxceleb_combined data/voxceleb_combined_cmn exp/voxceleb_combined_cmn + utils/fix_data_dir.sh data/voxceleb_combined_cmn +fi + +if [ $stage -le 5 ]; then + # Now, we need to remove features that are too short after removing silence + # frames. We want at least 4s (400 frames) per utterance. + min_len=400 + mv data/voxceleb_combined_cmn/utt2num_frames data/voxceleb_combined_cmn/utt2num_frames.bak + awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/voxceleb_combined_cmn/utt2num_frames.bak > data/voxceleb_combined_cmn/utt2num_frames + utils/filter_scp.pl data/voxceleb_combined_cmn/utt2num_frames data/voxceleb_combined_cmn/utt2spk > data/voxceleb_combined_cmn/utt2spk.new + mv data/voxceleb_combined_cmn/utt2spk.new data/voxceleb_combined_cmn/utt2spk + utils/fix_data_dir.sh data/voxceleb_combined_cmn + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 8 utterances. + min_num_utts=8 + awk '{print $1, NF-1}' data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/spk2num + awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' data/voxceleb_combined_cmn/spk2num | utils/filter_scp.pl - data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/spk2utt.new + mv data/voxceleb_combined_cmn/spk2utt.new data/voxceleb_combined_cmn/spk2utt + utils/spk2utt_to_utt2spk.pl data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/utt2spk + + utils/filter_scp.pl data/voxceleb_combined_cmn/utt2spk data/voxceleb_combined_cmn/utt2num_frames > data/voxceleb_combined_cmn/utt2num_frames.new + mv data/voxceleb_combined_cmn/utt2num_frames.new data/voxceleb_combined_cmn/utt2num_frames + + utils/fix_data_dir.sh data/voxceleb_combined_cmn +fi + +# Stages 6 through 8 are handled in run_xvector.sh. +# This script trains the x-vector DNN on the augmented voxceleb data. 
+local/nnet3/xvector/run_xvector.sh --stage $stage --train-stage $train_stage \ + --data data/voxceleb_combined_cmn --nnet-dir $model_dir \ + --egs-dir $model_dir/egs + +if [ $stage -le 9 ]; then + echo "$0: preparing a subset of Chime 6 training data to train PLDA model" + utils/subset_data_dir.sh data/${data_dir} 100000 data/plda_train + steps/make_mfcc.sh --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/plda_train exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/plda_train + local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/plda_train data/plda_train_cmn exp/plda_train_cmn + if [ -f data/plda_train/segments ]; then + cp data/plda_train/segments data/plda_train_cmn/ + fi +fi + +if [ $stage -le 10 ]; then + echo "$0: extracting x-vectors for PLDA training data" + utils/fix_data_dir.sh data/plda_train_cmn + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 10G" \ + --nj 40 --window 3.0 --period 10.0 --min-segment 1.5 --apply-cmn false \ + --hard-min true $model_dir \ + data/plda_train_cmn $model_dir/xvectors_plda_train +fi + +# Train PLDA models +if [ $stage -le 11 ]; then + echo "$0: training PLDA model" + $train_cmd $model_dir/xvectors_plda_train/log/plda.log \ + ivector-compute-plda ark:$model_dir/xvectors_plda_train/spk2utt \ + "ark:ivector-subtract-global-mean \ + scp:$model_dir/xvectors_plda_train/xvector.scp ark:- \ + | transform-vec $model_dir/xvectors_plda_train/transform.mat ark:- ark:- \ + | ivector-normalize-length ark:- ark:- |" \ + $model_dir/xvectors_plda_train/plda || exit 1; + cp $model_dir/xvectors_plda_train/plda $model_dir/ + cp $model_dir/xvectors_plda_train/transform.mat $model_dir/ + cp $model_dir/xvectors_plda_train/mean.vec $model_dir/ +fi diff --git a/egs/chime6/s5b_track2/local/train_lms_srilm.sh b/egs/chime6/s5b_track2/local/train_lms_srilm.sh new file mode 120000 index 00000000000..a7666f6cded --- /dev/null +++ b/egs/chime6/s5b_track2/local/train_lms_srilm.sh @@ -0,0 +1 @@ +../../s5_track1/local/train_lms_srilm.sh \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/train_sad.sh b/egs/chime6/s5b_track2/local/train_sad.sh new file mode 100755 index 00000000000..cbaf3dfc5de --- /dev/null +++ b/egs/chime6/s5b_track2/local/train_sad.sh @@ -0,0 +1,155 @@ +#!/usr/bin/env bash + +# Copyright 2017 Nagendra Kumar Goel +# 2017 Vimal Manohar +# 2019 Desh Raj +# Apache 2.0 + +# This script is based on local/run_asr_segmentation.sh script in the +# Aspire recipe. It demonstrates nnet3-based speech activity detection for +# segmentation. +# This script: +# 1) Prepares targets (per-frame labels) for a subset of training data +# using GMM models +# 2) Trains TDNN+Stats or TDNN+LSTM neural network using the targets +# 3) Demonstrates using the SAD system to get segments of dev data + +lang=data/lang # Must match the one used to train the models +lang_test=data/lang_test # Lang directory for decoding. + +data_dir= +test_sets= +# Model directory used to align the $data_dir to get target labels for training +# SAD. This should typically be a speaker-adapted system. +sat_model_dir= +# Model directory used to decode the whole-recording version of the $data_dir to +# get target labels for training SAD. This should typically be a +# speaker-independent system like an LDA+MLLT system. +model_dir= +graph_dir= # Graph for decoding whole-recording version of $data_dir. 
+ # If not provided, a new one will be created using $lang_test + +# List of weights on labels obtained from alignment; +# labels obtained from decoding; and default labels in out-of-segment regions +merge_weights=1.0,0.1,0.5 + +prepare_targets_stage=-10 +nstage=-10 +train_stage=-10 +stage=0 +nj=50 +reco_nj=40 + +# test options +test_nj=10 + +. ./cmd.sh +. ./conf/sad.conf + +if [ -f ./path.sh ]; then . ./path.sh; fi + +set -e -u -o pipefail +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + exit 1 +fi + +dir=exp/segmentation${affix} +sad_work_dir=exp/sad${affix}_${nnet_type}/ +sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a + +mkdir -p $dir +mkdir -p ${sad_work_dir} + +# See $lang/phones.txt and decide which should be garbage +garbage_phones="laughs inaudible" +silence_phones="sil spn noise" + +for p in $garbage_phones; do + for a in "" "_B" "_E" "_I" "_S"; do + echo "$p$a" + done +done > $dir/garbage_phones.txt + +for p in $silence_phones; do + for a in "" "_B" "_E" "_I" "_S"; do + echo "$p$a" + done +done > $dir/silence_phones.txt + +if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ + steps/segmentation/internal/verify_phones_list.py $lang/phones.txt; then + echo "$0: Invalid $dir/{silence,garbage}_phones.txt" + exit 1 +fi + +# The training data may already be segmented, so we first prepare +# a "whole" (unsegmented) version of the training data for training +# the SAD system. + +whole_data_dir=${data_dir}_whole +whole_data_id=$(basename $whole_data_dir) + +if [ $stage -le 0 ]; then + utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir +fi + +############################################################################### +# Extract features for the whole data directory. We extract 13-dim MFCCs to +# generate targets using the GMM system, and 40-dim MFCCs to train the NN-based +# SAD. 
+############################################################################### +if [ $stage -le 1 ]; then + steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ + --mfcc-config conf/mfcc.conf \ + $whole_data_dir exp/make_mfcc/${whole_data_id} + steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${whole_data_id} + utils/fix_data_dir.sh $whole_data_dir + + utils/copy_data_dir.sh $whole_data_dir ${whole_data_dir}_hires + steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf \ + ${whole_data_dir}_hires exp/make_mfcc/${whole_data_id}_hires + steps/compute_cmvn_stats.sh ${whole_data_dir}_hires exp/make_mfcc/${whole_data_id}_hires + utils/fix_data_dir.sh ${whole_data_dir}_hires +fi + +############################################################################### +# Prepare SAD targets for recordings +############################################################################### +targets_dir=$dir/${whole_data_id}_combined_targets_sub3 +if [ $stage -le 2 ]; then + steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ + --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ + --nj $nj --reco-nj $reco_nj --lang-test $lang \ + --garbage-phones-list $dir/garbage_phones.txt \ + --silence-phones-list $dir/silence_phones.txt \ + --merge-weights "$merge_weights" \ + --remove-mismatch-frames false \ + --graph-dir "$graph_dir" \ + $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir +fi + +############################################################################### +# Train a neural network for SAD +############################################################################### +if [ $stage -le 3 ]; then + if [ $nnet_type == "stats" ]; then + # Train a STATS-pooling network for SAD + local/segmentation/tuning/train_stats_sad_1a.sh \ + --stage $nstage --train-stage $train_stage \ + --targets-dir ${targets_dir} \ + --data-dir ${whole_data_dir}_hires --affix "1a" || exit 1 + + elif [ $nnet_type == "lstm" ]; then + # Train a TDNN+LSTM network for SAD + local/segmentation/tuning/train_lstm_sad_1a.sh \ + --stage $nstage --train-stage $train_stage \ + --targets-dir ${targets_dir} \ + --data-dir ${whole_data_dir}_hires --affix "1a" || exit 1 + + fi +fi + +exit 0; diff --git a/egs/chime6/s5b_track2/local/train_ts-vad.sh b/egs/chime6/s5b_track2/local/train_ts-vad.sh new file mode 100755 index 00000000000..d16dc790412 --- /dev/null +++ b/egs/chime6/s5b_track2/local/train_ts-vad.sh @@ -0,0 +1,422 @@ +#!/bin/bash +# Copyright 2020 Ivan Medennikov (STC-innovations Ltd) +# Apache 2.0 + +# This script trains a TS-VAD model using the same training data +# as the baseline acoustic model. + +. ./path.sh +. ./cmd.sh + +# Begin configuration section. 
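+# (Note on the learning-rate convention below: lrate is written without the +# leading "0.", so the default lrate=0003 expands to an initial effective +# rate of 0.0003 and a final rate of 0.00003 via 0.$lrate and 0.0$lrate in +# the training stage.) 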
+stage=0 +train_stage=-10 +srand=0 + +# Training options +num_epochs=2 +lrate=0003 +l2=0.002 +l2o=0.001 +common_egs_dir= +remove_egs=true + +lang=data/lang +silphonelist=1:2:3:4:5:21:22:23:24:25 +spnphonelist= + +sa=60 #number of seconds to sub-split speakers +basedata=train_worn_simu_u400k_cleaned_sp +srcdata=${basedata}_${sa}s +data=${srcdata}_hires +lats=${PWD}/exp/tri3_cleaned_ali_${basedata} +nnet3_affix=_train_worn_simu_u400k_cleaned_rvb +affix=1a + +tardir=$lats/VAD_targets +targets=$tardir/dense-4H/dense_targets.scp +ivector_dir=${PWD}/exp/nnet3${nnet3_affix} +nj_ivec=128 +nj_paste=48 +dir=exp/ts-vad_$affix + +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +json_ali=${PWD}/data/json_ali +sess_list="S03 S04 S05 S06 S07 S08 S12 S13 S16 S17 S18 S19 S20 S22 S23 S24" +sess_num=16 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +mdl=$lats/final.mdl +[ ! -f $mdl ] && echo "$0: expected model file $mdl to exist!" && exit 1; +ivdir=$ivector_dir/ivectors-offline_${data} +iv4dir=$ivector_dir/ivectors-offline-4spk_${data} + +if [ $stage -le 0 ]; then + if [ ! -f data/${srcdata}_hires/.done ]; then + echo "Splitting speakers in ${basedata} into ${sa}-second subspeakers" + utils/data/modify_speaker_info.sh --seconds-per-spk-max $sa data/${basedata}_hires data/${srcdata}_hires + touch data/${srcdata}_hires/.done + fi +fi + +if [ $stage -le 1 ]; then + outdir=$tardir + nj=$(cat $lats/num_jobs) || exit 1; + if [ -f $lats/ali.1.gz ]; then + if [ ! -f $outdir/.done ]; then + echo "Preparing per-utterance 1-speaker VAD targets from alignment" + $train_cmd JOB=1:$nj $outdir/log/ali_to_phones.JOB.log \ + gunzip -c $lats/ali.JOB.gz \| \ + ali-to-phones --frame-shift=0.01 --per-frame=true ${mdl} ark:- ark,t:$outdir/ali_phones.JOB.ark || exit 1; + $train_cmd JOB=1:$nj $outdir/log/conv_ali_to_vad.JOB.log \ + python3 local/ts-vad/conv_ali_to_vad_012.py "$silphonelist" "$spnphonelist" $outdir/ali_phones.JOB.ark $outdir/ali_vad_targets.JOB.ark || exit 1 + cat $outdir/ali_vad_targets.*.ark | sort > $outdir/ali_vad_targets.ark + if [ ! -f $outdir/targets.ark ]; then + vali_dst="ark,scp:$outdir/targets.ark,$outdir/targets.scp" + copy-int-vector "ark:$outdir/ali_vad_targets.ark" "$vali_dst" || exit 1 + fi + touch $outdir/.done + fi + fi +fi + +if [ $stage -le 2 ]; then + if [ ! -f $json_ali/.done ]; then + echo "Converting JSON to per-session VAD alignment (overlapped speech is treated as silence, to exclude these regions from i-vector estimation)" + mkdir -p $json_ali + for json in `find $json_dir/ -name "*.json"`; do + sess=$(basename $json | sed s:.json::) + echo $sess + $train_cmd $json_ali/${sess}.log \ + python local/ts-vad/make_json_align.py $json ark,t,scp:$json_ali/$sess.ark,$json_ali/${sess}.scp || exit 1; + $train_cmd $json_ali/${sess}_sp0.9.log \ + python local/ts-vad/make_json_align.py --frame_shift 0.009 $json ark,t,scp:$json_ali/${sess}_sp0.9.ark,$json_ali/${sess}_sp0.9.scp || exit 1; + sed -i s:\ :_sp0.9\ : $json_ali/${sess}_sp0.9.scp + $train_cmd $json_ali/${sess}_sp1.1.log \ + python local/ts-vad/make_json_align.py --frame_shift 0.011 $json ark,t,scp:$json_ali/${sess}_sp1.1.ark,$json_ali/${sess}_sp1.1.scp || exit 1; + sed -i s:\ :_sp1.1\ : $json_ali/${sess}_sp1.1.scp + done + cat $json_ali/*.scp > $json_ali/all_sess.scp + touch $json_ali/.done + fi +fi + +if [ $stage -le 3 ]; then + ivdata=${srcdata}_hires + outdir=$ivdir + if [ ! 
-f $outdir/.lats-weights.done ]; then + echo 'Preparing weights for i-vector extraction from ali/lats' + silence_weight=0.00001 + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + acwt=0.1 + if [ ! -f $lats/final.mdl ]; then + echo "$0: expected $lats/final.mdl to exist." + exit 1; + fi + if [ -f $lats/ali.1.gz ]; then + nj_orig=$(cat $lats/num_jobs) || exit 1; + rm $outdir/weights.*.gz 2>/dev/null + $train_cmd JOB=1:$nj_orig $outdir/log/ali_to_post.JOB.log \ + gunzip -c $lats/ali.JOB.gz \| \ + ali-to-post ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $lats/final.mdl ark:- ark:- \| \ + post-to-weights ark:- "ark,t:|gzip -c >$outdir/weights.JOB.gz" || exit 1; + for j in $(seq $nj_orig); do gunzip -c $outdir/weights.$j.gz; done | gzip -c >$outdir/weights_lats.gz || exit 1; + rm $outdir/weights.*.gz || exit 1; + elif [ -f $lats/lat.1.gz ]; then + nj_orig=$(cat $lats/num_jobs) || exit 1; + rm $outdir/weights.*.gz 2>/dev/null + $train_cmd JOB=1:$nj_orig $outdir/log/lat_to_post.JOB.log \ + lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $lats/lat.JOB.gz|" ark:/dev/null ark:- \| \ + ali-to-post ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $lats/final.mdl ark:- ark:- \| \ + post-to-weights ark:- "ark:|gzip -c >$outdir/weights.JOB.gz" || exit 1; + for j in $(seq $nj_orig); do gunzip -c $outdir/weights.$j.gz; done | gzip -c >$outdir/weights_lats.gz || exit 1; + rm $outdir/weights.*.gz || exit 1; + else + echo "$0: expected ali.1.gz or lat.1.gz to exist in $lats"; + exit 1; + fi + touch $outdir/.lats-weights.done + fi + if [ ! -f $outdir/.json-weights.done ]; then + echo 'Preparing weights for i-vector extraction from json' + perl local/ts-vad/prepare_json_weights.pl data/$ivdata/segments $json_ali/all_sess.scp $outdir/weights_json.scp || exit 1; + touch $outdir/.json-weights.done + fi + if [ ! -f $outdir/.mult-weights.done ]; then + echo 'Multiplying weights from lats and json' + $train_cmd $outdir/multiply-vectors.log \ + multiply-vectors --length-tolerance=2 ark:"gunzip -c $outdir/weights_lats.gz |" scp:$outdir/weights_json.scp ark,t:"| gzip -c >$outdir/weights_mult.gz" || exit 1; + touch $outdir/.mult-weights.done + fi + if [ ! -f $outdir/.done ]; then + echo 'Preparing single-speaker offline i-vectors' + local/ts-vad/extract_ivectors.sh --cmd $train_cmd --nj $nj_ivec \ + --sub-speaker-frames 0 --max-count 100 \ + data/$ivdata $lang $ivector_dir/extractor $outdir/weights_mult.gz $outdir || exit 1; + touch $outdir/.done + fi +fi + +if [ $stage -le 4 ]; then + outdir=$iv4dir + if [ ! -f $outdir/.done ]; then + mkdir -p $outdir + echo 'Preparing 4-speaker i-vectors' + if [ ! -f data/${srcdata}_hires/utt2spk_cl3 ]; then + echo 'Creating 3 negative utt2spk files with speakers from the same session' + local/ts-vad/make_negative_utt2spk.pl data/${srcdata}_hires/utt2spk \ + data/${srcdata}_hires/utt2spk_cl1 data/${srcdata}_hires/utt2spk_cl2 data/${srcdata}_hires/utt2spk_cl3 || exit 1; + fi + + cat $ivdir/ivectors_spk.*.ark > $outdir/ivectors_spk.ark + $train_cmd JOB=1:3 $outdir/log/apply-map.JOB.log \ + local/ts-vad/apply_map.pl --permissive -f 2 $outdir/ivectors_spk.ark \ $outdir/ivector_online.scp + touch $outdir/.done + fi +fi + +if [ $stage -le 5 ]; then + outdir=$(dirname $targets) + mkdir -p $outdir + tmp=$(dirname $outdir) + mkdir -p $tmp/tmp_sess + nj=$(cat $lats/num_jobs) || exit 1; + if [ ! -f $outdir/.done ]; then + echo 'Creating 8-dimensional dense targets for TS-VAD training' + [ ! 
-f $tmp/ali_vad_targets_wk.ark ] && grep -v "rev" $tmp/ali_vad_targets.ark > $tmp/ali_vad_targets_wk.ark + [ ! -f $tmp/segments_wk_ali ] && utils/filter_scp.pl $tmp/ali_vad_targets_wk.ark data/${srcdata}_hires/segments > $tmp/segments_wk_ali + [ ! -f data/${srcdata}_hires/utt2num_frames ] && feat-to-len scp:data/${srcdata}_hires/feats.scp ark,t:data/${srcdata}_hires/utt2num_frames + + for p in `seq 4`; do + [ ! -f $tmp/utt2spk_shuf${p} ] && cat data/${srcdata}_hires/utt2spk_shuf${p} | sort > $tmp/utt2spk_shuf${p} + done + + nj_dense=$((sess_num*3)) + j=0 + for sess in $sess_list; do + jp=$((3*j+1)) + for p in `seq 4`; do + [ ! -f $tmp/tmp_sess/utt2spk_shuf${p}.$jp ] && grep "$sess" $tmp/utt2spk_shuf${p} | grep -v "sp" > $tmp/tmp_sess/utt2spk_shuf${p}.$jp + done + [ ! -f $tmp/tmp_sess/ali_vad_targets_wk.$jp.ark ] && grep "$sess" $tardir/ali_vad_targets_wk.ark | grep -v "sp" > $tmp/tmp_sess/ali_vad_targets_wk.$jp.ark + + jp=$((3*j+2)) + for p in `seq 4`; do + [ ! -f $tmp/tmp_sess/utt2spk_shuf${p}.$jp ] && grep "$sess" $tmp/utt2spk_shuf${p} | grep "sp0.9" > $tmp/tmp_sess/utt2spk_shuf${p}.$jp + done + [ ! -f $tmp/tmp_sess/ali_vad_targets_wk.$jp.ark ] && grep "$sess" $tardir/ali_vad_targets_wk.ark | grep "sp0.9" > $tmp/tmp_sess/ali_vad_targets_wk.$jp.ark + + jp=$((3*j+3)) + for p in `seq 4`; do + [ ! -f $tmp/tmp_sess/utt2spk_shuf${p}.$jp ] && grep "$sess" $tmp/utt2spk_shuf${p} | grep "sp1.1" > $tmp/tmp_sess/utt2spk_shuf${p}.$jp + done + [ ! -f $tmp/tmp_sess/ali_vad_targets_wk.$jp.ark ] && grep "$sess" $tardir/ali_vad_targets_wk.ark | grep "sp1.1" > $tmp/tmp_sess/ali_vad_targets_wk.$jp.ark + j=$((j+1)) + done + + $train_cmd JOB=1:$nj_dense $outdir/log/prepare_targets.JOB.log \ + python3 local/ts-vad/conv_vad_to_dense_targets.py $tmp/tmp_sess/ali_vad_targets_wk.JOB.ark "ark,t,scp:$outdir/dense_targets.JOB.ark,$outdir/dense_targets.JOB.scp" \ + $tmp/tmp_sess/utt2spk_shuf1.JOB $tmp/tmp_sess/utt2spk_shuf2.JOB $tmp/tmp_sess/utt2spk_shuf3.JOB $tmp/tmp_sess/utt2spk_shuf4.JOB \ + data/${srcdata}_hires/segments $tmp/segments_wk_ali data/${srcdata}_hires/utt2num_frames || exit 1; + cat $outdir/dense_targets.*.scp | sort > $targets + + # some diagnostics + compute-cmvn-stats scp:$outdir/dense_targets.1.scp - | cmvn-to-nnet - $outdir/S03.cmvn.nnet + compute-cmvn-stats scp:$outdir/dense_targets.2.scp - | cmvn-to-nnet - $outdir/S03_sp0.9.cmvn.nnet + compute-cmvn-stats scp:$outdir/dense_targets.3.scp - | cmvn-to-nnet - $outdir/S03_sp1.1.cmvn.nnet + + touch $outdir/.done + fi +fi + +if [ $stage -le 14 ]; then + mark=$dir/.done_cfg + if [ ! 
-f $mark ]; then + echo "Creating neural net configs using the xconfig parser" + feat_dim=40 + num_targets=8 + mkdir -p $dir/configs + output_opts="l2-regularize=$l2o" + lstm_opts="l2-regularize=$l2" + linear_opts="l2-regularize=$l2 orthonormal-constraint=-1.0" + cnn_opts="l2-regularize=$l2" + + rproj=128 + nproj=32 + cell=896 + cat <<EOF > $dir/configs/network.xconfig + input dim=400 name=ivector + input dim=${feat_dim} name=input + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + batchnorm-component name=batchnorm input=idct + + stats-layer name=mean config=mean(-150:1:1:150) input=batchnorm + no-op-component name=batchnorm-cmn input=Sum(batchnorm,Scale(-1.0,mean)) + + no-op-component name=ivector-all input=ReplaceIndex(ivector,t,0) + dim-range-component name=ivector-1 input=ivector-all dim=100 dim-offset=0 + dim-range-component name=ivector-2 input=ivector-all dim=100 dim-offset=100 + dim-range-component name=ivector-3 input=ivector-all dim=100 dim-offset=200 + dim-range-component name=ivector-4 input=ivector-all dim=100 dim-offset=300 + + combine-feature-maps-layer name=combine_inputs input=Append(batchnorm, batchnorm-cmn) num-filters1=1 num-filters2=1 height=$feat_dim + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + linear-component $linear_opts name=aff1 input=Append(cnn4,ivector-1) dim=$((3*rproj)) + fast-lstmp-layer name=blstm1-1-forward input=aff1 cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-1 $lstm_opts + fast-lstmp-layer name=blstm1-1-backward input=aff1 cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=1 $lstm_opts + fast-lstmp-layer name=blstm2-1-forward input=Append(blstm1-1-forward, blstm1-1-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-2 $lstm_opts + fast-lstmp-layer name=blstm2-1-backward input=Append(blstm1-1-forward, blstm1-1-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=2 $lstm_opts + + linear-component $linear_opts name=aff2 input=Append(cnn4,ivector-2) dim=$((3*rproj)) + fast-lstmp-layer name=blstm1-2-forward input=aff2 cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-1 $lstm_opts + fast-lstmp-layer name=blstm1-2-backward input=aff2 cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=1 $lstm_opts + fast-lstmp-layer name=blstm2-2-forward input=Append(blstm1-2-forward, blstm1-2-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-2 $lstm_opts + fast-lstmp-layer name=blstm2-2-backward input=Append(blstm1-2-forward, blstm1-2-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=2 $lstm_opts + + linear-component $linear_opts name=aff3 input=Append(cnn4,ivector-3) dim=$((3*rproj)) + fast-lstmp-layer name=blstm1-3-forward input=aff3 cell-dim=$cell 
recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-1 $lstm_opts + fast-lstmp-layer name=blstm1-3-backward input=aff3 cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=1 $lstm_opts + fast-lstmp-layer name=blstm2-3-forward input=Append(blstm1-3-forward, blstm1-3-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-2 $lstm_opts + fast-lstmp-layer name=blstm2-3-backward input=Append(blstm1-3-forward, blstm1-3-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=2 $lstm_opts + + linear-component $linear_opts name=aff4 input=Append(cnn4,ivector-4) dim=$((3*rproj)) + fast-lstmp-layer name=blstm1-4-forward input=aff4 cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-1 $lstm_opts + fast-lstmp-layer name=blstm1-4-backward input=aff4 cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=1 $lstm_opts + fast-lstmp-layer name=blstm2-4-forward input=Append(blstm1-4-forward, blstm1-4-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-2 $lstm_opts + fast-lstmp-layer name=blstm2-4-backward input=Append(blstm1-4-forward, blstm1-4-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=2 $lstm_opts + + fast-lstmp-layer name=blstm3-forward input=Append(blstm2-1-forward, blstm2-1-backward, blstm2-2-forward, blstm2-2-backward, blstm2-3-forward, blstm2-3-backward, blstm2-4-forward, blstm2-4-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=-3 $lstm_opts + fast-lstmp-layer name=blstm3-backward input=Append(blstm2-1-forward, blstm2-1-backward, blstm2-2-forward, blstm2-2-backward, blstm2-3-forward, blstm2-3-backward, blstm2-4-forward, blstm2-4-backward) cell-dim=$cell recurrent-projection-dim=$rproj non-recurrent-projection-dim=$nproj delay=3 $lstm_opts + + output-layer $output_opts input=Append(blstm3-forward, blstm3-backward) name=output dim=2 + output-layer $output_opts input=Append(blstm3-forward, blstm3-backward) name=output2 dim=2 + output-layer $output_opts input=Append(blstm3-forward, blstm3-backward) name=output3 dim=2 + output-layer $output_opts input=Append(blstm3-forward, blstm3-backward) name=output4 dim=2 +EOF + steps/nnet3/xconfig_to_configs.py \ + --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs || exit 1 + echo "num_targets=$num_targets" >> $dir/configs/vars + + echo "Modifying final.config file to combine 4 softmax layers in the output-node " + sed -i 's:output\.:output1\.:g' $dir/configs/final.config + mv $dir/configs/final.config $dir/configs/final.config.tmp + grep -v "output\-node" $dir/configs/final.config.tmp > $dir/configs/final.config + echo "output-node name=output input=Append(output1.log-softmax, output2.log-softmax, output3.log-softmax, output4.log-softmax)" >> $dir/configs/final.config + + echo "Modifying final.config file to enforce weights sharing in affine and blstm layers" + sed -i s:component\ name=aff1:component\ name=aff-uni: $dir/configs/final.config + sed -i s:component=aff1:component=aff-uni: $dir/configs/final.config + sed -i s:component=aff2:component=aff-uni: $dir/configs/final.config + sed -i s:component=aff3:component=aff-uni: $dir/configs/final.config + sed -i s:component=aff4:component=aff-uni: $dir/configs/final.config + sed -i s:component\ name=blstm1-1:component\ 
name=blstm1-uni: $dir/configs/final.config + sed -i s:component\ name=blstm2-1:component\ name=blstm2-uni: $dir/configs/final.config + sed -i s:component=blstm1-1:component=blstm1-uni: $dir/configs/final.config + sed -i s:component=blstm1-2:component=blstm1-uni: $dir/configs/final.config + sed -i s:component=blstm1-3:component=blstm1-uni: $dir/configs/final.config + sed -i s:component=blstm1-4:component=blstm1-uni: $dir/configs/final.config + sed -i s:component=blstm2-1:component=blstm2-uni: $dir/configs/final.config + sed -i s:component=blstm2-2:component=blstm2-uni: $dir/configs/final.config + sed -i s:component=blstm2-3:component=blstm2-uni: $dir/configs/final.config + sed -i s:component=blstm2-4:component=blstm2-uni: $dir/configs/final.config + mv $dir/configs/final.config $dir/configs/final.config.tmp + grep -v "component\ name=aff2" $dir/configs/final.config.tmp | grep -v "component\ name=aff3" | grep -v "component\ name=aff4" | \ + grep -v "component\ name=blstm1-2" | grep -v "component\ name=blstm1-3" | grep -v "component\ name=blstm1-4" | \ + grep -v "component\ name=blstm2-2" | grep -v "component\ name=blstm2-3" | grep -v "component\ name=blstm2-4" > $dir/configs/final.config + nnet3-init --binary=false $dir/configs/final.config $dir/configs/init.raw || exit 1; + touch $mark + fi +fi + +if [ ! -f data/$data/utt2uniq.done ]; then + [ -f data/$data/utt2uniq ] && mv data/$data/utt2uniq data/$data/utt2uniq.bak + local/ts-vad/make_utt2uniq.pl data/$data/utt2spk data/$data/utt2uniq || exit 1; + touch data/$data/utt2uniq.done +fi + +if [ $stage -le 15 ]; then + mark=$dir/.done_dnn + if [ ! -f $mark ]; then + cp "$(readlink -f $0)" "$dir" + steps/nnet3/train_raw_rnn.py \ + --stage=$train_stage \ + --cmd="$train_cmd" \ + --feat.online-ivector-dir=$iv4dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2 \ + --trainer.num-epochs=$num_epochs \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.momentum=0.5 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.$lrate \ + --trainer.optimization.final-effective-lrate=0.0$lrate \ + --trainer.rnn.num-chunk-per-minibatch=128 \ + --trainer.samples-per-iter=15000 \ + --egs.chunk-left-context=30 \ + --egs.chunk-right-context=30 \ + --egs.chunk-width=40 \ + --use-dense-targets true \ + --feat-dir data/$data \ + --targets-scp $targets \ + --egs.cmd=run.pl \ + --egs.dir=$common_egs_dir \ + --cleanup.remove-egs false \ + --cleanup.preserve-model-interval=100 \ + --use-gpu=true \ + --dir=$dir || exit 1 + touch $mark + fi +fi + +echo Done diff --git a/egs/chime6/s5b_track2/local/truncate_rttm.py b/egs/chime6/s5b_track2/local/truncate_rttm.py new file mode 100755 index 00000000000..3de0c0a60d6 --- /dev/null +++ b/egs/chime6/s5b_track2/local/truncate_rttm.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# Apache 2.0 +# This script truncates an RTTM file +# using a UEM file and writes the result to a new RTTM file +# +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +from scorelib.turn import trim_turns +import scorelib.rttm as rttm_func +from scorelib.uem import load_uem + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script truncates an RTTM file + using a UEM file""") + parser.add_argument("rttm_file", type=str, + help="""Input RTTM file. 
+ The format of the RTTM file is + <type> <file-id> <channel-id> <begin-time> """ + """<duration> <NA> <NA> <speaker> <conf>""") + parser.add_argument("uem_file", type=str, + help="""Input UEM file. + The format of the UEM file is + <file-id> <channel-id> <begin-time> <end-time>""") + parser.add_argument("rttm_file_write", type=str, + help="""output RTTM file.""") + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = get_args() + rttm_writer = open(args.rttm_file_write, 'w') + turns, speaker_ids, file_ids = rttm_func.load_rttm(args.rttm_file) + loaded_uem = load_uem(args.uem_file) + truncated_turns = trim_turns(turns, loaded_uem) + rttm_func.write_rttm(args.rttm_file_write, truncated_turns) diff --git a/egs/chime6/s5b_track2/local/ts-vad/apply_map.pl b/egs/chime6/s5b_track2/local/ts-vad/apply_map.pl new file mode 100755 index 00000000000..6a61cf647cb --- /dev/null +++ b/egs/chime6/s5b_track2/local/ts-vad/apply_map.pl @@ -0,0 +1,98 @@ +#!/usr/bin/env perl +use warnings; #sed replacement for -w perl parameter +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This program is a bit like ./sym2int.pl in that it applies a map +# to things in a file, but it's a bit more general in that it doesn't +# assume the things being mapped to are single tokens, they could +# be sequences of tokens. See the usage message. +# Compared to the utils/apply_map.pl, permissive mode is fixed. + +$permissive = 0; + +for ($x = 0; $x <= 2; $x++) { + + if (@ARGV > 0 && $ARGV[0] eq "-f") { + shift @ARGV; + $field_spec = shift @ARGV; + if ($field_spec =~ m/^\d+$/) { + $field_begin = $field_spec - 1; $field_end = $field_spec - 1; + } + if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10) + if ($1 ne "") { + $field_begin = $1 - 1; # Change to zero-based indexing. + } + if ($2 ne "") { + $field_end = $2 - 1; # Change to zero-based indexing. + } + } + if (!defined $field_begin && !defined $field_end) { + die "Bad argument to -f option: $field_spec"; + } + } + + if (@ARGV > 0 && $ARGV[0] eq '--permissive') { + shift @ARGV; + # Mapping is optional (missing key is printed to output) + $permissive = 1; + } +} + +if(@ARGV != 1) { + print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n"; + print STDERR <<'EOF'; +Usage: apply_map.pl [options] map <input >output + options: [-f <field-range>] [--permissive] + This applies a map to some specified fields of some input text: + For each line in the map file: the first field is the thing we + map from, and the remaining fields are the sequence we map it to. + The -f (field-range) option says which fields of the input file the map + should apply to. + If the --permissive option is supplied, fields which are not present + in the map will be left as they were. + Applies the map 'map' to all input text, where each line of the map + is interpreted as a map from the first field to the list of the other fields + Note: <field-range> can look like 4-5, or 4-, or 5-, or 1, it means the field + range in the input to apply the map to. 
diff --git a/egs/chime6/s5b_track2/local/ts-vad/apply_map.pl b/egs/chime6/s5b_track2/local/ts-vad/apply_map.pl
new file mode 100755
index 00000000000..6a61cf647cb
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/apply_map.pl
@@ -0,0 +1,98 @@
+#!/usr/bin/env perl
+use warnings; #sed replacement for -w perl parameter
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0.
+
+# This program is a bit like ./sym2int.pl in that it applies a map
+# to things in a file, but it's a bit more general in that it doesn't
+# assume the things being mapped to are single tokens, they could
+# be sequences of tokens. See the usage message.
+# Compared to the utils/apply_map.pl, permissive mode is fixed.
+
+$permissive = 0;
+
+for ($x = 0; $x <= 2; $x++) {
+
+  if (@ARGV > 0 && $ARGV[0] eq "-f") {
+    shift @ARGV;
+    $field_spec = shift @ARGV;
+    if ($field_spec =~ m/^\d+$/) {
+      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
+    }
+    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
+      if ($1 ne "") {
+        $field_begin = $1 - 1;  # Change to zero-based indexing.
+      }
+      if ($2 ne "") {
+        $field_end = $2 - 1;    # Change to zero-based indexing.
+      }
+    }
+    if (!defined $field_begin && !defined $field_end) {
+      die "Bad argument to -f option: $field_spec";
+    }
+  }
+
+  if (@ARGV > 0 && $ARGV[0] eq '--permissive') {
+    shift @ARGV;
+    # Mapping is optional (lines with missing keys are skipped, with a warning)
+    $permissive = 1;
+  }
+}
+
+if(@ARGV != 1) {
+  print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n";
+  print STDERR <<'EOF';
+Usage: apply_map.pl [options] map <input >output
+ options: [-f <field-range>] [--permissive]
+ This applies a map to some specified fields of some input text:
+ For each line in the map file: the first field is the thing we
+ map from, and the remaining fields are the sequence we map it to.
+ The -f (field-range) option says which fields of the input file the
+ map should apply to.
+ If the --permissive option is supplied, lines whose mapped fields are
+ not present in the map will be skipped (with a warning on stderr).
+ Applies the map 'map' to all input text, where each line of the map
+ is interpreted as a map from the first field to the list of the other fields
+ Note: <field-range> can look like 4-5, or 4-, or 5-, or 1, it means the field
+ range in the input to apply the map to.
+ e.g.: echo A B | apply_map.pl a.txt
+ where a.txt is:
+ A a1 a2
+ B b
+ will produce:
+ a1 a2 b
EOF
+  exit(1);
+}
+
+($map_file) = @ARGV;
+open(M, "<$map_file") || die "Error opening map file $map_file: $!";
+
+while (<M>) {
+  @A = split(" ", $_);
+  @A >= 1 || die "apply_map.pl: empty line.";
+  $i = shift @A;
+  $o = join(" ", @A);
+  $map{$i} = $o;
+}
+
+CU: while(<STDIN>) {
+  @A = split(" ", $_);
+  for ($x = 0; $x < @A; $x++) {
+    if ( (!defined $field_begin || $x >= $field_begin)
+         && (!defined $field_end || $x <= $field_end)) {
+      $a = $A[$x];
+      if (!defined $map{$a}) {
+        if (!$permissive) {
+          die "apply_map.pl: undefined key $a in $map_file\n";
+        } else {
+          print STDERR "apply_map.pl: warning! missing key $a in $map_file\n";
+          next CU;
+        }
+      } else {
+        $A[$x] = $map{$a};
+      }
+    }
+  }
+  print join(" ", @A) . "\n";
+}
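Since the reworked permissive mode is the behavioral difference from utils/apply_map.pl, here is a toy Python restatement of the loop above for `-f 2 --permissive` (the map and input lines are made up):

    mapping = {'spkA': '[ 0.1 0.2 ]'}         # map file: key -> replacement sequence
    for line in ['utt1 spkA', 'utt2 spkB']:   # input; field 2 gets mapped
        utt, key = line.split()
        if key not in mapping:
            continue                          # permissive: skip the line (with a warning)
        print(utt, mapping[key])              # -> utt1 [ 0.1 0.2 ]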
diff --git a/egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh b/egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh
new file mode 100755
index 00000000000..54e6673ad16
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/compute_ts-vad_weights.sh
@@ -0,0 +1,164 @@
+#!/bin/bash
+# Copyright 2012  Brno University of Technology (Author: Karel Vesely)
+#           2013  Johns Hopkins University (Author: Daniel Povey)
+#           2015  Vijayaditya Peddinti
+#           2016  Vimal Manohar
+#           2017  Pegah Ghahremani
+#           2020  Ivan Medennikov (STC-innovations Ltd)
+# Apache 2.0
+
+# Computes TS-VAD weights using a raw nnet3 network.
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+stage=0
+# Begin configuration.
+srcdir=
+frames_per_chunk=50
+extra_left_context=0
+extra_right_context=0
+extra_left_context_initial=-1
+extra_right_context_final=-1
+online_ivector_dir=
+graphs_scp=
+max_jobs_run=20
+n_spk=4
+
+normalize_transform=
+add_deltas=false
+delta_opts=
+num_threads=1
+use_gpu=true
+mb_size=128
+optimize=false
+apply_exp=true
+use_subsampling=false
+# End configuration options.
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f path.sh ] && . ./path.sh # source the path.
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <data-dir> <raw-nnet> <output-dir>"
+  echo "e.g.: $0 data/train exp/nnet4/bnex.raw data_bn/train"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --nj <nj>                                        # number of parallel jobs"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+data=$1
+extractor=$2
+dir=$3
+
+if [ -f $dir/.done ]; then
+  echo "$0: $dir/.done already exists!"
+  exit 0;
+fi
+
+[ -z $srcdir ] && srcdir=`dirname $extractor`
+
+mkdir -p $dir/{log,tmp}
+sdata=$data/split${nj}
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || \
+  split_data.sh $data $nj || exit 1;
+
+extra_files=
+if [ ! -z "$online_ivector_dir" ]; then
+  steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1
+  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
+fi
+
+for f in $extractor $data/feats.scp $extra_files; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
+[ ! -z "$delta_opts" ] && add_deltas=true
+
+[ -z "$normalize_transform" ] && [ -f $srcdir/normalize.feature_transform ] && normalize_transform=$srcdir/normalize.feature_transform
+echo "normalize transform file: $normalize_transform"
+
+cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
+
+feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
+
+if [ ! -z "$normalize_transform" ]; then
+  feats="$feats nnet-forward $normalize_transform ark:- ark:- |"
+fi
+
+if $add_deltas; then
+  feats="$feats add-deltas $delta_opts ark:- ark:- |"
+fi
+
+ivector_opts=
+if [ ! -z "$online_ivector_dir" ]; then
+  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
+  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
+fi
+
+frame_subsampling_opt=
+if [ -f $srcdir/frame_subsampling_factor ] && $use_subsampling ; then
+  # e.g. for 'chain' systems
+  frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor)
+  frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
+  cp $srcdir/frame_subsampling_factor $dir
+  if [[ $frame_subsampling_factor -gt 1 ]]; then
+    # Assume a chain system, check argument sanity.
+    if [[ ! ($scale_opts == *--self-loop-scale=1.0* &&
+             $scale_opts == *--transition-scale=1.0* &&
+             $acoustic_scale = '1.0') ]]; then
+      echo "$0: ERROR: frame-subsampling-factor is not 1, assuming a chain system."
+      echo "...  You should pass the following options to this script:"
+      echo "  --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'" \
+           "--acoustic_scale 1.0"
+    fi
+  fi
+fi
+
+##
+gpu_opt=
+thread_string=
+if $use_gpu ; then
+  thread_string="-batch --minibatch-size=$mb_size"
+  gpu_opt="--gpu 1"
+  use_gpu=wait
+else
+  echo "Warning: GPU is disabled; computation will run on CPU and may be slow."
+  thread_string=""
+  use_gpu=no
+fi
+
+if [ $stage -le 1 ]; then
+  $cmd --max-jobs-run $max_jobs_run $gpu_opt JOB=1:$nj $dir/log/nnet3_compute.JOB.log \
+    nnet3-compute$thread_string $ivector_opts $frame_subsampling_opt \
+     --apply-exp=$apply_exp \
+     --frames-per-chunk=$frames_per_chunk \
+     --extra-left-context=$extra_left_context \
+     --extra-right-context=$extra_right_context \
+     --extra-left-context-initial=$extra_left_context_initial \
+     --extra-right-context-final=$extra_right_context_final \
+     --use-gpu=$use_gpu \
+     $extractor "$feats" ark,t:$dir/tmp/outputs.JOB.ark || exit 1;
+  cat $dir/tmp/outputs.*.ark > $dir/outputs.ark
+  rm $dir/tmp/outputs.*.ark
+fi
+
+if [ $stage -le 2 ]; then
+  [ -f $dir/weights.ark ] && rm $dir/weights.ark
+  for i in `seq $n_spk`; do
+    $cmd $dir/log/make_weights.$i.log \
+      select-feats $((2*i-1)) ark:$dir/outputs.ark ark:- \| \
+      feat-to-post ark:- ark:- \| \
+      post-to-weights ark:- ark,t:"| sed s/\ /-$i\ / > $dir/weights.$i.ark" || exit 1;
+  done
+  cat $dir/weights.*.ark | sort > $dir/weights.ark
+  rm $dir/outputs.ark
+  rm $dir/weights.*.ark
+fi
+
+echo "$0: done extracting weights"
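The stage-2 loop above assumes the network output interleaves (silence, speech) columns per speaker, so 0-based column 2*i-1 holds speaker i's speech probability; a toy check with made-up values:

    import numpy as np

    # One frame of TS-VAD output for 4 speakers: (sil_i, speech_i) pairs.
    frame = np.array([0.9, 0.1, 0.2, 0.8, 0.7, 0.3, 0.6, 0.4])
    for i in range(1, 5):                                  # speakers 1..4
        print('speaker', i, 'p(speech) =', frame[2*i - 1]) # what select-feats $((2*i-1)) picks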
diff --git a/egs/chime6/s5b_track2/local/ts-vad/conv_ali_to_vad_012.py b/egs/chime6/s5b_track2/local/ts-vad/conv_ali_to_vad_012.py
new file mode 100644
index 00000000000..291ca3e8d67
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/conv_ali_to_vad_012.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Yuri Khokhlov, Ivan Medennikov (STC-innovations Ltd)
+# Apache 2.0.
+
+"""This script transforms phone indices in an alignment to 0 (silence phones), 1 (speech phones), 2 (spn phones)"""
+
+import os
+import argparse
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Usage: conv_ali_to_vad_012.py 1:2:3:4:5 6:7:8:9:10 <phone-ali> <vad-ali>')
+    parser.add_argument('silence_phones', type=str)
+    parser.add_argument('spn_phones', type=str)
+    parser.add_argument('phone_ali', type=str)
+    parser.add_argument('vad_ali', type=str)
+    args = parser.parse_args()
+
+    print('Options:')
+    print('  Silence phones (colon-separated list): {}'.format(args.silence_phones))
+    print('  Spoken-noise phones (colon-separated list): {}'.format(args.spn_phones))
+    print('  Input phone ali in text format: {}'.format(args.phone_ali))
+    print('  Output vad ali in text format: {}'.format(args.vad_ali))
+
+    silence_set = set(args.silence_phones.split(':'))
+    print("sil phones: ")
+    print(args.silence_phones.split(':'))
+    spn_set = set(args.spn_phones.split(':'))
+    print("spn phones: ")
+    print(args.spn_phones.split(':'))
+
+    assert os.path.exists(args.phone_ali), 'File does not exist {}'.format(args.phone_ali)
+    parent = os.path.dirname(os.path.abspath(args.vad_ali))
+    if not os.path.exists(parent):
+        os.makedirs(parent)
+
+    print('Starting to convert')
+    count = 0
+    with open(args.phone_ali) as ali_file:
+        with open(args.vad_ali, 'wt') as vad_file:
+            for line in ali_file:
+                line = line.strip()
+                if len(line) == 0:
+                    continue
+                parts = line.split(' ')
+                parts = list(filter(None, parts))
+                assert len(parts) > 1, 'Empty alignment in line {}'.format(line)
+                vad_file.write('{}'.format(parts[0]))
+                phones = parts[1:]
+                for phone in phones:
+                    if phone in silence_set:
+                        vad_file.write(' 0')
+                    elif phone in spn_set:
+                        vad_file.write(' 2')
+                    else:
+                        vad_file.write(' 1')
+                vad_file.write('\n')
+                count += 1
+    print('Converted alignments for {} utterances'.format(count))
+
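A toy line-level example of the 0/1/2 mapping the script performs (the phone ids are hypothetical):

    silence_set, spn_set = {'1', '2'}, {'9'}
    ali = 'utt1 1 1 41 42 9 1'               # one phone-level alignment line
    labels = ['0' if p in silence_set else '2' if p in spn_set else '1'
              for p in ali.split()[1:]]
    print(ali.split()[0], ' '.join(labels))  # -> utt1 0 0 1 1 2 0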
diff --git a/egs/chime6/s5b_track2/local/ts-vad/conv_vad_to_dense_targets.py b/egs/chime6/s5b_track2/local/ts-vad/conv_vad_to_dense_targets.py
new file mode 100644
index 00000000000..cc5bbf4d93e
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/conv_vad_to_dense_targets.py
@@ -0,0 +1,336 @@
+#!/usr/bin/env python
+# Copyright 2020 Ivan Medennikov, Maxim Korenevsky (STC-innovations Ltd)
+# Apache 2.0.
+
+"""This script prepares overlapped 4-speaker dense targets for TS-VAD training
+   using segments and VAD alignment of kinect and worn utterances
+   (VAD alignment from worn utterances is more reliable than from kinects,
+   so we use a large scaling factor for worn utterances).
+   The resulting targets are 4 pairs of probabilities:
+   (sil_spk1, speech_spk1, sil_spk2, speech_spk2, sil_spk3, speech_spk3, sil_spk4, speech_spk4)"""
+
+import os
+import argparse
+import numpy as np
+from kaldiio import WriteHelper
+
+def ProcessSession(segments, writer, worn_scale):
+    segments.sort(key = lambda tup: tup[6])  # sort by start time
+    for i in range(len(segments)):
+        utt_id, spk, n_spk, n_spk2, n_spk3, n_spk4, start, end, vad_info, device = segments[i]
+        if n_spk == '':
+            continue
+
+        vad_info_dense = np.zeros((vad_info[0], 8))
+
+        # looking for left-side overlaps
+        i1 = i-1
+        cnt = 0
+        nls = 0
+        nl1 = 0
+        nl2 = 0
+        nl3 = 0
+        nl4 = 0
+        while i1 >= 0:
+            utt_id1, spk1, x, y, z, q, start1, end1, vad_info1, device1 = segments[i1]
+            if x != '':
+                i1 -= 1
+                continue
+            if end1 > start:
+                scale = 1
+                if device1 == 'W':
+                    scale = worn_scale
+                if spk1 == spk:
+                    nls += 1
+                if spk1 == n_spk:
+                    nl1 += 1
+                    #print('utt {}, spk {}: left intersection with spk 1 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale))
+                    for k in range(min(end, end1) - start):
+                        if vad_info1[start - start1 + k] == '1':
+                            vad_info_dense[k][1] += scale
+                        else:
+                            vad_info_dense[k][0] += scale
+                elif spk1 == n_spk2:
+                    nl2 += 1
+                    #print('utt {}, spk {}: left intersection with spk 2 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale))
+                    for k in range(min(end, end1) - start):
+                        if vad_info1[start - start1 + k] == '1':
+                            vad_info_dense[k][3] += scale
+                        else:
+                            vad_info_dense[k][2] += scale
+                elif spk1 == n_spk3:
+                    nl3 += 1
+                    #print('utt {}, spk {}: left intersection with spk 3 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale))
+                    for k in range(min(end, end1) - start):
+                        if vad_info1[start - start1 + k] == '1':
+                            vad_info_dense[k][5] += scale
+                        else:
+                            vad_info_dense[k][4] += scale
+                elif spk1 == n_spk4:
+                    nl4 += 1
+                    #print('utt {}, spk {}: left intersection with spk 4 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale))
+                    for k in range(min(end, end1) - start):
+                        if vad_info1[start - start1 + k] == '1':
+                            vad_info_dense[k][7] += scale
+                        else:
+                            vad_info_dense[k][6] += scale
+            else:
+                cnt += 1
+                if cnt == 10:
+                    break
+            i1 -= 1
+
+        # looking for right-side overlaps
+        i1 = i+1
+        cnt = 0
+        nrs = 0
+        nr1 = 0
+        nr2 = 0
+        nr3 = 0
+        nr4 = 0
+        while i1 < len(segments):
+            utt_id1, spk1, x, y, z, q, start1, end1, vad_info1, device1 = segments[i1]
+            if x != '':
+                i1 += 1
+                continue
+            if end > start1:
+                scale = 1
+                if device1 == 'W':
+                    scale = worn_scale
+                if spk1 == spk:
+                    nrs += 1
+                if spk1 == n_spk:
+                    nr1 += 1
+                    #print('utt {}, spk {}: right intersection with spk 1 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale))
+                    for k in range(min(end, end1) - start1):
+                        if vad_info1[k] == '1':
+                            vad_info_dense[start1-start+k][1] += scale
+                        else:
+                            vad_info_dense[start1-start+k][0] += scale
+                elif spk1 == n_spk2:
+                    nr2 += 1
+                    #print('utt {}, spk {}: right intersection with spk 2 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale))
+                    for k in range(min(end, end1) - start1):
+                        if vad_info1[k] == '1':
+                            vad_info_dense[start1-start+k][3] += scale
+                        else:
+                            vad_info_dense[start1-start+k][2] += scale
+                elif spk1 == n_spk3:
+                    nr3 += 1
+                    #print('utt {}, spk {}: right intersection with spk 3 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale))
+                    for k in range(min(end, end1) - start1):
+                        if vad_info1[k] == '1':
+                            vad_info_dense[start1-start+k][5] += scale
+                        else:
+                            vad_info_dense[start1-start+k][4] += scale
+                elif spk1 == n_spk4:
+                    nr4 += 1
+                    #print('utt {}, spk {}: right intersection with spk 4 utt {}, adding scale {}'.format(utt_id,spk1,utt_id1,scale))
+                    for k in range(min(end, end1) - start1):
+                        if vad_info1[k] == '1':
+                            vad_info_dense[start1-start+k][7] += scale
+                        else:
+                            vad_info_dense[start1-start+k][6] += scale
+            else:
+                cnt += 1
+                if cnt == 10:
+                    break
+            i1 += 1
+
+        for j in range(vad_info[0]):
+            for head in range(4):
+                total = vad_info_dense[j][2*head] + vad_info_dense[j][2*head+1]
+                if total == 0:
+                    vad_info_dense[j][2*head] = 1
+                    vad_info_dense[j][2*head+1] = 0
+                else:
+                    vad_info_dense[j][2*head] /= total
+                    vad_info_dense[j][2*head+1] /= total
+
+        #print("utt {}: {}+{}+{}+{} left-overlaps and {} left-self-overlaps, {}+{}+{}+{} right-overlaps and {} right-self-overlaps".format(utt_id,nl1,nl2,nl3,nl4,nls,nr1,nr2,nr3,nr4,nrs))
+        if nls == 0 and nrs == 0:
+            print("WARNING: utt {} does not have targets!".format(utt_id))
+            continue
+        writer(utt_id, vad_info_dense)
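For orientation, the per-frame normalization at the end of ProcessSession turns the accumulated scales into per-head (sil, speech) probabilities; a toy frame with made-up scores:

    import numpy as np

    frame = np.array([2.0, 0.0, 1.0, 10.0, 0.0, 0.0, 3.0, 1.0])  # 4 (sil, speech) pairs
    for head in range(4):
        total = frame[2*head] + frame[2*head+1]
        if total == 0:                        # no evidence for this head: default to silence
            frame[2*head], frame[2*head+1] = 1, 0
        else:                                 # normalize the pair to probabilities
            frame[2*head] /= total
            frame[2*head+1] /= total
    print(frame)   # [1. 0. 0.0909 0.9091 1. 0. 0.75 0.25]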
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Usage: conv_vad_to_dense_targets.py <vad-ali> <wspec> <utt2spk-n1> <utt2spk-n2> <utt2spk-n3> <utt2spk-n4> <segments-utt> <segments-ali> <utt2dur>')
+    parser.add_argument('vad_ali', type=str)
+    parser.add_argument('wspec', type=str)
+    parser.add_argument('utt2spk_n1', type=str)
+    parser.add_argument('utt2spk_n2', type=str)
+    parser.add_argument('utt2spk_n3', type=str)
+    parser.add_argument('utt2spk_n4', type=str)
+    parser.add_argument('segments_utt', type=str)
+    parser.add_argument('segments_ali', type=str)
+    parser.add_argument('utt2dur', type=str)
+    parser.add_argument('--worn_scale', type=float, default=10)
+
+    args = parser.parse_args()
+
+    print('Options:')
+    print('  Input vad ali in text format: {}'.format(args.vad_ali))
+    print('  Output wspecifier: {}'.format(args.wspec))
+    print('  Utterance-to-spk map for head #1: {}'.format(args.utt2spk_n1))
+    print('  Utterance-to-spk map for head #2: {}'.format(args.utt2spk_n2))
+    print('  Utterance-to-spk map for head #3: {}'.format(args.utt2spk_n3))
+    print('  Utterance-to-spk map for head #4: {}'.format(args.utt2spk_n4))
+    print('  Segments for uid: {}'.format(args.segments_utt))
+    print('  Segments for ali: {}'.format(args.segments_ali))
+    print('  Utt2dur (in frames) for uid: {}'.format(args.utt2dur))
+    print('  Worn scaling factor: {}'.format(args.worn_scale))
+
+    assert os.path.exists(args.vad_ali), 'File does not exist {}'.format(args.vad_ali)
+
+    print('Starting to convert')
+
+    print('Loading speaker info for head #1')
+    n_speakers = dict()
+    with open(args.utt2spk_n1, 'r') as f:
+        for line in f:
+            uid, n_sid = line.strip().split()
+            n_spk = n_sid.split('_')[0]
+            n_spk_parts = n_spk.split('-')[:-2]
+            if n_spk_parts[0][:3] == 'rev':
+                del n_spk_parts[0]
+            if len(n_spk_parts) > 1 and n_spk_parts[1][:3] == 'rev':
+                del n_spk_parts[1]
+            n_spk = '-'.join(n_spk_parts)
+            n_speakers[uid] = n_spk
+
+    print('Loading speaker info for head #2')
+    n_speakers2 = dict()
+    with open(args.utt2spk_n2, 'r') as f:
+        for line in f:
+            uid, n_sid = line.strip().split()
+            n_spk = n_sid.split('_')[0]
+            n_spk_parts = n_spk.split('-')[:-2]
+            if n_spk_parts[0][:3] == 'rev':
+                del n_spk_parts[0]
+            if len(n_spk_parts) > 1 and n_spk_parts[1][:3] == 'rev':
+                del n_spk_parts[1]
+            n_spk = '-'.join(n_spk_parts)
+            n_speakers2[uid] = n_spk
+
+    print('Loading speaker info for head #3')
+    n_speakers3 = dict()
+    with open(args.utt2spk_n3, 'r') as f:
+        for line in f:
+            uid, n_sid = line.strip().split()
+            n_spk = n_sid.split('_')[0]
+            n_spk_parts = n_spk.split('-')[:-2]
+            if n_spk_parts[0][:3] == 'rev':
+                del n_spk_parts[0]
+            if len(n_spk_parts) > 1 and n_spk_parts[1][:3] == 'rev':
+                del n_spk_parts[1]
+            n_spk = '-'.join(n_spk_parts)
+            n_speakers3[uid] = n_spk
+
+    print('Loading speaker info for head #4')
+    n_speakers4 = dict()
+    with open(args.utt2spk_n4, 'r') as f:
+        for line in f:
+            uid, n_sid = line.strip().split()
+            n_spk = n_sid.split('_')[0]
+
n_spk_parts = n_spk.split('-')[:-2] + if n_spk_parts[0][:3]=='rev': + del n_spk_parts[0] + if len(n_spk_parts)>1 and n_spk_parts[1][:3]=='rev': + del n_spk_parts[1] + n_spk = '-'.join(n_spk_parts) + n_speakers4[uid] = n_spk + + print('Loading segments boundaries') + seg_by_uid = dict() + with open(args.segments_utt) as f: + for line in f: + uid, wav_id, start, end = line.strip().split() + seg_by_uid[uid]=(int(float(start)*100),int(float(end)*100)) + + print('Loading durations of utterances') + len_by_uid = dict() + with open(args.utt2dur) as f: + for line in f: + uid, length = line.strip().split() + len_by_uid[uid]=int(length) + + print('Loading VAD alignment segments boundaries') + seg_by_ali_uid = dict() + with open(args.segments_ali) as f: + for line in f: + utt_id, wav_id, start, end = line.strip().split() + seg_by_ali_uid[utt_id]=(int(float(start)*100),int(float(end)*100)) + + print('Loading VAD alignment') + seg_by_sess = dict() + with open(args.vad_ali) as f: + for line in f: + vad_info = line.strip().split() + utt_id_ = vad_info[0] + vad_info = vad_info[1:] + + utt_id = utt_id_ + utt_id_parts=utt_id.split('-') + utt_id = '-'.join(utt_id_parts[:-3]) + spk, sess, device = utt_id.split('_')[:3] + spk_parts = spk.split('-') + if spk_parts[0][:3]=='rev': + del spk_parts[0] + if len(spk_parts)>1 and spk_parts[1][:3]=='rev': + del spk_parts[1] + spk = '-'.join(spk_parts) + + if device == 'NOLOCATION.L' or device == 'NOLOCATION.R': + device = 'W' + + if sess not in seg_by_sess: + seg_by_sess[sess] = list() + start, end = seg_by_ali_uid[utt_id_] + + assert end-start >= len(vad_info), '{} {} {}'.format(start, end, len(vad_info)) + assert end-start-len(vad_info)<=3, '{} {} {}'.format(start, end, len(vad_info)) + end = start + len(vad_info) + seg_by_sess[sess].append((utt_id_, spk, '', '', '', '', start, end, vad_info, device)) + + skip=0 + for uid in n_speakers.keys(): + n_spk = n_speakers[uid].split('_')[0] + if uid not in n_speakers2.keys(): + skip+=1 + continue + n_spk2 = n_speakers2[uid].split('_')[0] + if uid not in n_speakers3.keys(): + skip+=1 + continue + n_spk3 = n_speakers3[uid].split('_')[0] + if uid not in n_speakers4.keys(): + skip+=1 + continue + n_spk4 = n_speakers4[uid].split('_')[0] + spk, sess = uid.split('_')[:2] + spk_parts = spk.split('-') + if spk_parts[0][:3]=='rev': + del spk_parts[0] + if len(spk_parts)>1 and spk_parts[1][:3]=='rev': + del spk_parts[1] + spk = '-'.join(spk_parts) + if uid not in seg_by_uid.keys(): + skip+=1 + continue + start, end = seg_by_uid[uid] + if uid not in len_by_uid.keys(): + skip+=1 + continue + vad_info = len_by_uid[uid] + assert end-start >= vad_info, '{} {} {}'.format(start, end, vad_info) + assert end-start-vad_info<=3, '{} {} {}'.format(start, end, vad_info) + end = start + vad_info + + seg_by_sess[sess].append((uid, spk, n_spk, n_spk2, n_spk3, n_spk4, start, end, [vad_info], 'kinect')) + + print('{} utts are skipped as missing in utt2spk'.format(skip)) + print('Processing segments session-by-session') + with WriteHelper(args.wspec) as writer: + for sess in seg_by_sess: + print(sess) + ProcessSession(seg_by_sess[sess], writer, args.worn_scale) \ No newline at end of file diff --git a/egs/chime6/s5b_track2/local/ts-vad/convert_prob_to_rttm.py b/egs/chime6/s5b_track2/local/ts-vad/convert_prob_to_rttm.py new file mode 100644 index 00000000000..0934f8d2a81 --- /dev/null +++ b/egs/chime6/s5b_track2/local/ts-vad/convert_prob_to_rttm.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Yuri Khokhlov, Ivan Medennikov 
(STC-innovations Ltd)
+# Apache 2.0.
+
+"""This script converts TS-VAD output probabilities to a NIST RTTM file.
+
+The segments file format is:
+<segment-id> <recording-id> <start-time> <end-time>
+
+The labels file format is:
+<segment-id> <speaker-id>
+
+The output RTTM format is:
+<type> <file> <chnl> <tbeg> \
+        <tdur> <ortho> <stype> <name> <conf> <slat>
+where:
+<type> = "SPEAKER"
+<file> = <recording-id>
+<chnl> = "0"
+<tbeg> = start time of segment
+<tdur> = duration of segment
+<ortho> = "<NA>"
+<stype> = "<NA>"
+<name> = <speaker-id>
+<conf> = "<NA>"
+<slat> = "<NA>"
+"""
+
+
+import os
+import argparse
+import regex as re
+import numpy as np
+from scipy import signal, ndimage
+from kaldiio import ReadHelper
+
+
+class Segment:
+    def __init__(self, begin, end, label):
+        self.begin = begin
+        self.end = end
+        self.label = label
+
+    def length(self):
+        return self.end - self.begin
+
+
+class VadProbSet:
+    def __init__(self, vad_rspec, reg_exp):
+        data = dict()
+        prev = -1
+        with ReadHelper(vad_rspec) as reader:
+            for utid, prob in reader:
+                result = reg_exp.match(utid)
+                assert result is not None, 'Wrong utterance ID format: \"{}\"'.format(utid)
+                sess_indx = result.group(1)
+                spkr = result.group(2)
+
+                result = reg_exp.match(sess_indx)
+                assert result is not None, 'Wrong utterance ID format: \"{}\"'.format(sess_indx)
+                sess = result.group(1)
+                indx = int(result.group(2))
+
+                sess = sess + '-' + spkr
+
+                if sess not in data.keys():
+                    assert indx == 1
+                    prev = -1
+                    data[sess] = list()
+                assert indx >= prev
+                data[sess].append(prob)
+                prev = indx
+        print('  loaded {} sessions'.format(len(data)))
+        print('  combining fragments')
+        self.data = dict()
+        for sess, items in data.items():
+            self.data[sess] = np.hstack(items)
+
+    def apply_filter(self, window, threshold, threshold_first):
+        for sess in self.data.keys():
+            if threshold_first:
+                self.data[sess] = np.vectorize(lambda value: 1.0 if value > threshold else 0.0)(self.data[sess]).astype(dtype=np.int32)
+                if window > 1:
+                    self.data[sess] = signal.medfilt(self.data[sess], window).astype(dtype=np.int32)
+            else:
+                if window > 1:
+                    self.data[sess] = signal.medfilt(self.data[sess], window)
+                self.data[sess] = np.vectorize(lambda value: 1.0 if value > threshold else 0.0)(self.data[sess]).astype(dtype=np.int32)
+
+    def convert(self, frame_shift, min_silence, min_speech, out_rttm):
+        min_silence = int(round(min_silence / frame_shift))
+        min_speech = int(round(min_speech / frame_shift))
+        with open(out_rttm, 'wt', encoding='utf-8') as wstream:
+            for sess, prob in self.data.items():
+                print('  session: {} num_frames: {} duration: {:.2f} hrs'.format(sess, len(prob), len(prob) * frame_shift / 60 / 60))
+                segments = list()
+                for i, label in enumerate(prob):
+                    if (len(segments) == 0) or (segments[-1].label != label):
+                        segments.append(Segment(i, i + 1, label))
+                    else:
+                        segments[-1].end += 1
+                if (min_silence > 0) or (min_speech > 0):
+                    items = segments
+                    segments = list()
+                    for segm in items:
+                        if len(segments) == 0:
+                            segments.append(segm)
+                        elif segm.label == segments[-1].label:
+                            segments[-1].end = segm.end
+                        else:
+                            min_length = min_silence if segm.label == 0 else min_speech
+                            if segm.length() < min_length:
+                                segments[-1].end = segm.end
+                            else:
+                                segments.append(segm)
+                for segm in segments:
+                    if segm.label == 1:
+                        begin = frame_shift * segm.begin
+                        length = frame_shift * segm.length()
+                        result = reg_exp.match(sess)
+                        assert result is not None, 'Wrong format: \"{}\"'.format(sess)
+                        utid = result.group(1)
+                        spk = result.group(2)
+                        wstream.write('SPEAKER {} 1 {:7.3f} {:7.3f} <NA> <NA> {} <NA> <NA>\n'.format(utid, begin, length, spk))
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Usage: convert_prob_to_rttm.py <vad-rspec> <out-rttm>')
+    parser.add_argument("--frame_shift", "-s", type=float, default=0.010)
+    parser.add_argument("--reg_exp", "-x", type=str, default=r'^(\S+)-(\d+)$')
+    parser.add_argument("--window", "-w", type=int, default=1)
+    parser.add_argument("--threshold", "-t", type=float, default=0.5)
+    parser.add_argument("--threshold_first", "-r", action="store_true")
+    parser.add_argument("--min_silence", "-k", type=float, default=0.0)
+    parser.add_argument("--min_speech", "-m", type=float, default=0.0)
+    parser.add_argument('vad_rspec', type=str)
+    parser.add_argument('out_rttm', type=str)
+    args = parser.parse_args()
+
+    print('Options:')
+    print('  Frame shift in sec: {}'.format(args.frame_shift))
+    print('  Utterance ID regexp: {}'.format(args.reg_exp))
+    print('  Med. filter window: {}'.format(args.window))
+    print('  Prob. threshold: {}'.format(args.threshold))
+    print('  Apply thresh. first: {}'.format(args.threshold_first))
+    print('  Min silence length: {}'.format(args.min_silence))
+    print('  Min speech length: {}'.format(args.min_speech))
+    print('  VAD rspec: {}'.format(args.vad_rspec))
+    print('  Output rttm file: {}'.format(args.out_rttm))
+
+    reg_exp = re.compile(args.reg_exp)
+
+    parent = os.path.dirname(os.path.abspath(args.out_rttm))
+    if not os.path.exists(parent):
+        os.makedirs(parent)
+
+    print('Loading VAD probabilities')
+    vad_prob = VadProbSet(args.vad_rspec, reg_exp)
+
+    print('Applying filtering')
+    vad_prob.apply_filter(args.window, args.threshold, args.threshold_first)
+
+    print('Writing rttm')
+    vad_prob.convert(args.frame_shift, args.min_silence, args.min_speech, args.out_rttm)
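To make the post-processing concrete, a toy run of the run-length encoding and minimum-duration merging in convert(), followed by the RTTM line format it emits; labels, ids and times below are made up (min_silence=4 and min_speech=2 frames for the toy):

    labels = [0, 1, 1, 0, 0, 0, 1, 1, 1, 0]        # thresholded per-frame labels
    segments = []                                   # run-length encode: [begin, end, label]
    for i, lab in enumerate(labels):
        if not segments or segments[-1][2] != lab:
            segments.append([i, i + 1, lab])
        else:
            segments[-1][1] += 1
    # segments == [[0, 1, 0], [1, 3, 1], [3, 6, 0], [6, 9, 1], [9, 10, 0]]

    merged = []                                     # absorb too-short runs into the previous run
    for beg, end, lab in segments:
        min_len = 4 if lab == 0 else 2              # min_silence / min_speech in frames
        if merged and (lab == merged[-1][2] or (end - beg) < min_len):
            merged[-1][1] = end
        else:
            merged.append([beg, end, lab])
    # merged == [[0, 1, 0], [1, 10, 1]]: both short silences were absorbed

    # Each surviving speech run becomes one RTTM line:
    print('SPEAKER {} 1 {:7.3f} {:7.3f} <NA> <NA> {} <NA> <NA>'.format('S02', 0.01 * 1, 0.01 * 9, '2'))
    # -> SPEAKER S02 1   0.010   0.090 <NA> <NA> 2 <NA> <NA>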
diff --git a/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it1.sh b/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it1.sh
new file mode 100755
index 00000000000..277a1ebaa4d
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it1.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+# Copyright 2020 Ivan Medennikov (STC-innovations Ltd)
+
+# Apache 2.0.
+#
+# This script performs the 1st iteration of TS-VAD diarization,
+# using an initial diarization rttm to estimate i-vectors.
+
+cmd="run.pl"
+ref_rttm=
+lang=data/lang
+
+#blstm processing parameters
+extra_left_context=30
+extra_right_context=30
+frames_per_chunk=40
+
+#post-processing parameters
+thr=0.4
+window=51
+min_silence=0.3
+min_speech=0.2
+
+nj=8
+nj_feats=2
+piece=10000
+
+ivector_affix=baseline-init
+
+echo "$0 $@"  # Print the command line for logging
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+if [ $# != 4 ]; then
+  echo "Usage: $0 <ts-vad-model-dir> <ivector-dir> <init-name> <out-dir>"
+  echo "e.g.: $0 exp/ts-vad exp/nnet3 dev_beamformit_dereverb_diarized exp/ts-vad/it1"
+  echo "Options: "
+  echo "  --nj <nj>                          # number of parallel jobs."
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --ref_rttm ./chime6_rttm/dev_rttm  # the location of the reference RTTM file"
+  echo "  --ivector_affix baseline-init      # affix corresponding to the initial diarization"
+  echo "  --piece 10000                      # raw wavs will be split into non-overlapping pieces of this size (in frames)"
+  echo "  --thr 0.4                          # post-processing: probability threshold"
+  echo "  --window 51                        # post-processing: median filter window (in frames)"
+  echo "  --min_silence 0.3                  # post-processing: minimum length of silence (in seconds)"
+  echo "  --min_speech 0.2                   # post-processing: minimum length of speech (in seconds)"
+  exit 1;
+fi
+
+dir=$1
+ivector_dir=$2
+initname=$3
+outdir=$4
+
+test="$(cut -d'_' -f1 <<<"$initname")"
+
+#estimating i-vectors using the initial diarization
+dset=${initname}_hires
+ivdir=${ivector_dir}/ivectors_${dset}_${ivector_affix}
+if [ ! -f $ivdir/ivector_online.scp ]; then
+  echo "Extracting i-vectors for $dset"
+  steps/online/nnet2/extract_ivectors.sh --cmd "$cmd" --nj $nj \
+    --silence-weight 0.00001 \
+    --sub-speaker-frames 0 --max-count 100 \
+    data/$dset $lang $ivector_dir/extractor $ivdir || exit 1;
+fi
+
+#preparing 4-speaker track2 data
+dsetsrc=$dset
+name=$(echo $initname | sed s/_diarized//)
+dset=${name}_U06_hires
+if [ ! -f data/$dset/.done ]; then
+  mkdir -p data/$dset
+  cp data/$dsetsrc/wav.scp data/$dset/wav.scp
+  awk '{print $1" "$1}' data/$dset/wav.scp > data/$dset/utt2spk
+  awk '{print $1" "$1}' data/$dset/wav.scp > data/$dset/spk2utt
+  utils/fix_data_dir.sh data/$dset
+  steps/make_mfcc.sh --nj $nj_feats --mfcc-config conf/mfcc_hires.conf data/$dset data/$dset/log data/$dset/data || exit 1;
+  touch data/$dset/.done
+fi
+
+#splitting 4-speaker track2 data into pieces
+dsetsrc=$dset
+dset=${dset}_split${piece}
+if [ ! -f data/$dset/.done ]; then
+  mkdir -p data/$dset
+  cp data/${dsetsrc}/wav.scp data/$dset
+  feat-to-len scp:data/$dsetsrc/feats.scp ark,t:data/$dsetsrc/utt2len
+  local/ts-vad/split_feats_seg.pl data/$dsetsrc/feats.scp data/$dsetsrc/utt2spk data/$dsetsrc/utt2len $piece data/$dset/feats.scp data/$dset/utt2spk data/$dset/segments
+  utils/utt2spk_to_spk2utt.pl data/$dset/utt2spk > data/$dset/spk2utt
+  utils/fix_data_dir.sh data/$dset
+  touch data/$dset/.done
+fi
+
+#preparing 4-speaker i-vectors
+iv4dir=${ivector_dir}/ivectors-4spk_${dset}_${ivector_affix}
+if [ ! -f $iv4dir/.done ]; then
+  mkdir -p $iv4dir
+  echo "Making pseudo-online 4spk i-vectors using source $ivdir"
+  cat $ivdir/ivectors_spk.*.ark > $iv4dir/ivectors_spk.ark
+
+  for spk in `seq 4`; do
+    awk -v "spk=$spk" '{printf "%s %s-%s\n", $1, $2, spk}' data/$dset/utt2spk > data/$dset/utt2spk.$spk
+  done
+
+  $train_cmd JOB=1:4 $iv4dir/log/apply-map.JOB.log \
+    utils/apply_map.pl -f 2 $iv4dir/ivectors_spk.ark \<data/$dset/utt2spk.JOB \>$iv4dir/ivectors_utt.JOB.ark || exit 1;
+
+  ivector_dim=$[$(head -n 1 $ivdir/ivectors_spk.1.ark | wc -w) - 3] || exit 1;
+  base_feat_dim=$(feat-to-dim scp:data/$dset/feats.scp -) || exit 1;
+  start_dim=$base_feat_dim
+  end_dim=$[$base_feat_dim+$ivector_dim-1]
+  absdir=$(utils/make_absolute.sh $iv4dir)
+  cp $ivdir/{ivector_period,final.ie.id} $iv4dir/
+  ivector_period=$(cat $ivdir/ivector_period)
+
+  $cmd JOB=1:4 $iv4dir/log/duplicate_feats.JOB.log \
+    append-vector-to-feats scp:data/$dset/feats.scp ark:$iv4dir/ivectors_utt.JOB.ark ark:- \| \
+    select-feats "$start_dim-$end_dim" ark:- ark:- \| \
+    subsample-feats --n=$ivector_period ark:- ark:- \| \
+    copy-feats --compress=true ark:- \
+    ark,scp:$absdir/ivector_online.JOB.ark,$absdir/ivector_online.JOB.scp || exit 1;
+
+  $cmd $iv4dir/log/paste-feats.log \
+    paste-feats scp:$iv4dir/ivector_online.1.scp scp:$iv4dir/ivector_online.2.scp scp:$iv4dir/ivector_online.3.scp scp:$iv4dir/ivector_online.4.scp ark:- \| \
+    copy-feats --compress=true ark:- ark,scp:$absdir/ivector_online.ark,$absdir/ivector_online.scp || exit 1;
+  touch $iv4dir/.done
+fi
+
+#computing TS-VAD per-frame probabilities for each speaker
+out=$outdir/$dset
+if [ ! -f $out/.done ]; then
+  local/ts-vad/compute_ts-vad_weights.sh --nj $nj_feats --use-gpu true --cmd "$cmd" --online-ivector-dir $iv4dir \
+    --extra-left-context $extra_left_context --extra-right-context $extra_right_context --frames-per-chunk $frames_per_chunk \
+    data/$dset $dir/final.raw $out || exit 1;
+  touch $out/.done
+fi
+
+#TS-VAD probabilities post-processing and DER scoring
+scoring=$out/scoring
+hyp_rttm=$scoring/rttm
+if [ ! -f $scoring/.done ]; then
+  if [ ! -f $hyp_rttm ]; then
+    python local/ts-vad/convert_prob_to_rttm.py --threshold $thr --window $window --min_silence $min_silence --min_speech $min_speech ark:"sort $out/weights.ark |" $hyp_rttm || exit 1;
+  fi
+  echo "Diarization results for $test"
+  [ ! -f $ref_rttm.scoring ] && sed 's/_U0[1-6]\.ENH//g' $ref_rttm > $ref_rttm.scoring
+  [ ! -f $hyp_rttm.scoring ] && sed 's/_U0[1-6]\.ENH//g' $hyp_rttm > $hyp_rttm.scoring
+  ref_rttm_path=$(readlink -f ${ref_rttm}.scoring)
+  hyp_rttm_path=$(readlink -f ${hyp_rttm}.scoring)
+  [ ! -f ./local/uem_file.scoring ] && cat ./local/uem_file | grep 'U06' | sed 's/_U0[1-6]//g' > ./local/uem_file.scoring
+  cd dscore && python score.py -u ../local/uem_file.scoring -r $ref_rttm_path \
+    -s $hyp_rttm_path 2>&1 | tee -a ../$scoring/DER && cd .. || exit 1;
+  touch $scoring/.done
+fi
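vad_prob_mod.py itself is not included in this patch; judging only from the --t/--mt help text in the next script, it keeps a speaker's initial weight when it passes an absolute threshold and dominates the other speakers, roughly:

    import numpy as np

    # Hypothetical per-frame speech probabilities of the 4 speakers.
    p = np.array([0.9, 0.6, 0.05, 0.02])
    t, mt = 0.0, 0.7
    rel = p / p.sum()             # p_i / (p1 + p2 + p3 + p4)
    keep = (p > t) & (rel > mt)
    print(keep)                   # all False: this overlapped frame is excluded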
diff --git a/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it2.sh b/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it2.sh
new file mode 100755
index 00000000000..15f328c206e
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/diarize_TS-VAD_it2.sh
@@ -0,0 +1,208 @@
+#!/bin/bash
+# Copyright 2020 Ivan Medennikov
+
+# Apache 2.0.
+#
+# This script performs the 2nd and further iterations of TS-VAD diarization
+# on a set of kinect channels, followed by averaging.
+# Probabilities from the previous iteration are used to estimate i-vectors.
+
+cmd="run.pl"
+ref_rttm=
+lang=data/lang
+audio_dir=CHiME6/audio
+
+#blstm processing parameters
+extra_left_context=30
+extra_right_context=30
+frames_per_chunk=40
+
+#post-processing parameters
+thr=0.4
+window=51
+min_silence=0.3
+min_speech=0.2
+
+nj=8
+nj_feats=2
+piece=10000 #raw wavs will be split into non-overlapping pieces of this size (in frames)
+ups=18
+wpeid=
+channels="CH1 CH2 CH3 CH4"
+
+#parameters for modification of initial weights
+t=0
+mt=0.7
+
+it=2
+ivector_affix=it1-init
+
+echo "$0 $@"  # Print the command line for logging
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+if [ $# != 4 ]; then
+  echo "Usage: $0 <ts-vad-model-dir> <ivector-dir> <init-dir> <out-dir>"
+  echo "e.g.: $0 exp/ts-vad exp/nnet3 exp/ts-vad/it1 exp/ts-vad/it2"
+  echo "Options: "
+  echo "  --nj <nj>                          # number of parallel jobs."
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --ref_rttm ./chime6_rttm/dev_rttm  # the location of the reference RTTM file"
+  echo "  --it 2                             # current iteration of TS-VAD"
+  echo "  --ivector_affix it1-init           # affix corresponding to the initial weights"
+  echo "  --channels CH1 CH2 CH3 CH4         # kinect channels to be processed"
+  echo "  --audio_dir CHiME6/audio           # path to wav files"
+  echo "  --wpeid WPE2m                      # affix for non-original wavs, e.g., blockwise WPE processed"
+  echo "  --piece 10000                      # raw wavs will be split into non-overlapping pieces of this size (in frames)"
+  echo "  --ups 18                           # number of pieces considered as one speaker"
+  echo "  --t 0                              # absolute threshold for initial weights"
+  echo "  --mt 0.7                           # relative threshold for pi/(p1+p2+p3+p4) in initial weights (to exclude overlapping regions from i-vectors estimation)"
+  echo "  --thr 0.4                          # post-processing: probability threshold"
+  echo "  --window 51                        # post-processing: median filter window (in frames)"
+  echo "  --min_silence 0.3                  # post-processing: minimum length of silence (in seconds)"
+  echo "  --min_speech 0.2                   # post-processing: minimum length of speech (in seconds)"
+  exit 1;
+fi
+
+dir=$1
+ivector_dir=$2
+initdir=$3
+outdir=$4
+
+initname=$(basename $initdir)
+test="$(cut -d'_' -f1 <<<"$initname")"
+
+weights=$initdir/weights.ark
+weights_mod=$initdir/weights_t${t}_mt${mt}.ark
+if [ ! -f ${weights_mod}.gz ]; then
+  python local/ts-vad/vad_prob_mod.py --threshold $t --multispk_threshold $mt ark:$weights ark,t:${weights_mod}
+  cat ${weights_mod} | sed s/_U06.ENH// | sort | gzip -c > ${weights_mod}.gz
+  rm $weights_mod
+fi
+for spk in `seq 4`; do
+  [ ! -f ${weights_mod}.${spk}.gz ] && gunzip -c ${weights_mod}.gz | grep "\-$spk\ " | sed s/\-$spk\ /\ / | gzip -c > ${weights_mod}.${spk}.gz
+done
+
+kinects="U01 U02 U03 U04 U05 U06"
+[ "$test" == "dev" ] && kinects="U01 U02 U03 U04 U06"
+[ "$test" == "eval" ] && kinects="U01 U02 U04 U05 U06"
+
+sum_scps=""
+n=0
+for u in $kinects; do
+  for ch in $channels; do
+    id=${u}.${ch}${wpeid}
+    echo "processing $id"
+
+    dset=${test}_${id}_hires
+    if [ ! -f data/$dset/.done ]; then
+      mkdir -p data/$dset
+      ls $audio_dir/$test/ | grep "wav" | grep "$u" | grep "$ch" | awk -v "pth=$audio_dir/$test" '{printf "%s %s/%s\n", $1, pth, $1}' | sed -E s/_[^\ ]+// > data/$dset/wav.scp
+      awk '{print $1" "$1}' data/$dset/wav.scp > data/$dset/utt2spk
+      awk '{print $1" "$1}' data/$dset/wav.scp > data/$dset/spk2utt
+      utils/fix_data_dir.sh data/$dset
+      steps/make_mfcc.sh --nj $nj_feats --mfcc-config conf/mfcc_hires.conf data/$dset data/$dset/log data/$dset/data || exit 1;
+      touch data/$dset/.done
+    fi
+
+    dsetsrc=$dset
+    dset=${dset}_split${piece}
+    if [ ! -f data/$dset/.done ]; then
+      mkdir -p data/$dset
+      cp data/${dsetsrc}/wav.scp data/$dset
+      feat-to-len scp:data/$dsetsrc/feats.scp ark,t:data/$dsetsrc/utt2len
+      local/ts-vad/split_feats_seg.pl data/$dsetsrc/feats.scp data/$dsetsrc/utt2spk data/$dsetsrc/utt2len $piece data/$dset/feats.scp data/$dset/utt2spk data/$dset/segments
+      utils/utt2spk_to_spk2utt.pl data/$dset/utt2spk > data/$dset/spk2utt
+      utils/fix_data_dir.sh data/$dset
+      touch data/$dset/.done
+    fi
+
+    dsetsrc=$dset
+    dset=${dset}_${ups}ups
+    if [ ! -f data/$dset/.done ]; then
+      utils/copy_data_dir.sh data/$dsetsrc data/$dset
+      local/ts-vad/modify_ups_utt2spk.pl data/$dsetsrc/utt2spk $ups data/$dset/utt2spk
+      utils/utt2spk_to_spk2utt.pl data/$dset/utt2spk > data/$dset/spk2utt
+      utils/fix_data_dir.sh data/$dset
+      touch data/$dset/.done
+    fi
+
+    ivdir=${ivector_dir}/${test}_${ivector_affix}/ivectors_${dset}
+    for spk in `seq 4`; do
+      if [ ! -f $ivdir/$spk/ivector_online.scp ]; then
+        echo "Extracting i-vectors for $dset"
+        steps/online/nnet2/extract_ivectors.sh --cmd "$decode_cmd" --nj $nj \
+          --silence-weight 0.00001 \
+          --sub-speaker-frames 0 --max-count 100 \
+          data/$dset $lang $ivector_dir/extractor ${weights_mod}.${spk}.gz $ivdir/$spk || exit 1;
+      fi
+    done
+
+    iv4dir=${ivector_dir}/${test}_${ivector_affix}/ivectors-4spk_${dset}
+    if [ ! -f $iv4dir/.done ]; then
+      mkdir -p $iv4dir
+      echo "Making pseudo-online 4spk i-vectors using source $ivdir"
+      for spk in `seq 4`; do
+        cat $ivdir/$spk/ivectors_spk.*.ark > $iv4dir/ivectors_spk.$spk.ark
+      done
+      $train_cmd JOB=1:4 $iv4dir/log/apply-map.JOB.log \
+        utils/apply_map.pl -f 2 $iv4dir/ivectors_spk.JOB.ark \<data/$dset/utt2spk \>$iv4dir/ivectors_utt.JOB.ark || exit 1;
+
+      ivector_dim=$[$(head -n 1 $iv4dir/ivectors_spk.1.ark | wc -w) - 3] || exit 1;
+      base_feat_dim=$(feat-to-dim scp:data/$dset/feats.scp -) || exit 1;
+      start_dim=$base_feat_dim
+      end_dim=$[$base_feat_dim+$ivector_dim-1]
+      absdir=$(utils/make_absolute.sh $iv4dir)
+      cp $ivdir/1/{ivector_period,final.ie.id} $iv4dir/
+      ivector_period=$(cat $iv4dir/ivector_period)
+
+      $train_cmd JOB=1:4 $iv4dir/log/duplicate_feats.JOB.log \
+        append-vector-to-feats scp:data/$dset/feats.scp ark:$iv4dir/ivectors_utt.JOB.ark ark:- \| \
+        select-feats "$start_dim-$end_dim" ark:- ark:- \| \
+        subsample-feats --n=$ivector_period ark:- ark:- \| \
+        copy-feats --compress=true ark:- \
+        ark,scp:$absdir/ivector_online.JOB.ark,$absdir/ivector_online.JOB.scp || exit 1;
+
+      $train_cmd $iv4dir/log/paste-feats.log \
+        paste-feats scp:$iv4dir/ivector_online.1.scp scp:$iv4dir/ivector_online.2.scp scp:$iv4dir/ivector_online.3.scp scp:$iv4dir/ivector_online.4.scp ark:- \| \
+        copy-feats --compress=true ark:- ark,scp:$absdir/ivector_online.ark,$absdir/ivector_online.scp || exit 1;
+      touch $iv4dir/.done
+    fi
+
+    out=$outdir/$dset
+    if [ ! -f $out/.done ]; then
+      local/ts-vad/compute_ts-vad_weights.sh --nj $nj --use-gpu true --cmd "$decode_cmd" --online-ivector-dir $iv4dir \
+        --extra-left-context $extra_left_context --extra-right-context $extra_right_context --frames-per-chunk $frames_per_chunk \
+        data/$dset $dir/final.raw $out || exit 1;
+      touch $out/.done
+    fi
+    sum_scps="${sum_scps}ark:$out/weights.ark "
+    n=$((n+1))
+  done
+done
+
+id=${n}ch-AVG${wpeid}
+dset=${test}_${id}_hires_split${piece}_${ups}ups
+out=$outdir/$dset
+if [ ! -f $out/.done ]; then
+  scale=$(awk -v "n=$n" 'BEGIN {print 1/n}')
+  $train_cmd $out/log/vector-sum.log \
+    vector-sum $sum_scps ark:- \| vector-scale --scale=$scale ark:- ark,t:$out/weights.ark || exit 1;
+  touch $out/.done
+fi
+
+scoring=$out/scoring
+hyp_rttm=$scoring/rttm
+if [ ! -f $scoring/.done ]; then
+  if [ ! -f $hyp_rttm ]; then
+    python local/ts-vad/convert_prob_to_rttm.py --threshold $thr --window $window --min_silence $min_silence --min_speech $min_speech ark:"sort $out/weights.ark |" $hyp_rttm || exit 1;
+  fi
+  echo "Diarization results for $test"
+  [ ! -f $ref_rttm.scoring ] && sed 's/_U0[1-6]\.ENH//g' $ref_rttm > $ref_rttm.scoring
+  [ ! -f $hyp_rttm.scoring ] && sed 's/_U0[1-6]\.ENH//g' $hyp_rttm > $hyp_rttm.scoring
+  ref_rttm_path=$(readlink -f ${ref_rttm}.scoring)
+  hyp_rttm_path=$(readlink -f ${hyp_rttm}.scoring)
+  [ ! -f ./local/uem_file.scoring ] && cat ./local/uem_file | grep 'U06' | sed 's/_U0[1-6]//g' > ./local/uem_file.scoring
+  cd dscore && python score.py -u ../local/uem_file.scoring -r $ref_rttm_path \
+    -s $hyp_rttm_path 2>&1 | tee -a ../$scoring/DER && cd .. || exit 1;
+  touch $scoring/.done
+fi
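The final averaging stage is an arithmetic mean over the n per-channel weight archives; a toy equivalent of the vector-sum | vector-scale --scale=1/n pipeline above:

    import numpy as np

    chan_probs = [np.array([0.9, 0.2]), np.array([0.7, 0.4]), np.array([0.8, 0.3])]
    avg = sum(chan_probs) / len(chan_probs)   # vector-sum, then scale by 1/n
    print(avg)                                # [0.8 0.3]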
diff --git a/egs/chime6/s5b_track2/local/ts-vad/extract_ivectors.sh b/egs/chime6/s5b_track2/local/ts-vad/extract_ivectors.sh
new file mode 100755
index 00000000000..e9b3b95c178
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/extract_ivectors.sh
@@ -0,0 +1,296 @@
+#!/bin/bash
+
+# Copyright 2013  Daniel Povey
+# Apache 2.0.
+
+
+# This script computes iVectors in the same format as extract_ivectors_online.sh,
+# except that they are actually not really computed online; they are first computed
+# per speaker and then just duplicated many times.
+# This is mainly intended for use in decoding, where you want the best possible
+# quality of iVectors.
+#
+# This setup also makes it possible to use a previous decoding or alignment, to
+# down-weight silence in the stats (default is --silence-weight 0.0).
+#
+# This is for when you use the "online-decoding" setup in an offline task, and
+# you want the best possible results.
+# Compared to steps/online/nnet2/extract_ivectors.sh, this script uses the
+# modified apply_map.pl with permissive mode.
+
+# Begin configuration section.
+nj=30
+cmd="run.pl"
+stage=0
+num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select
+min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
+ivector_period=10
+posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
+                    # inter-frame correlations. Making this small during iVector
+                    # extraction is equivalent to scaling up the prior, and will
+                    # tend to produce smaller iVectors where data-counts are
+                    # small. It's not so important that this match the value
+                    # used when training the iVector extractor, but more important
+                    # that this match the value used when you do real online decoding
+                    # with the neural nets trained with these iVectors.
+max_count=100 # Interpret this as a number of frames times posterior scale...
+              # this config ensures that once the count exceeds this (i.e.
+              # 1000 frames, or 10 seconds, by default), we start to scale
+              # down the stats, accentuating the prior term. This seems quite
+              # important for some reason.
+sub_speaker_frames=0 # If >0, during iVector estimation we split each speaker
+                     # into possibly many 'sub-speakers', each with at least
+                     # this many frames of speech (evaluated after applying
+                     # silence_weight, so will typically exclude silence.
+                     # e.g. set this to 1000, and it will require at least 10 seconds
+                     # of speech per sub-speaker.
+
+compress=true # If true, compress the iVectors stored on disk (it's lossy
+              # compression, as used for feature matrices).
+silence_weight=0.0
+acwt=0.1 # used if input is a decode dir, to get best path from lattices.
+mdl=final # change this if decode directory did not have ../final.mdl present.
+num_threads=1 # Number of threads used by ivector-extract. It is usually not
+              # helpful to set this to > 1. It is only useful if you have
+              # fewer speakers than the number of jobs you want to run.
+
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 4 ] && [ $# != 5 ]; then
+  echo "Usage: $0 [options] <data> <lang> <extractor-dir> [<ali-dir>|<decode-dir>|<weights-archive>] <ivector-dir>"
+  echo " e.g.: $0 data/test data/lang exp/nnet2_online/extractor exp/tri3/decode_test exp/nnet2_online/ivectors_test"
+  echo "If <ali-dir> or <decode-dir> is provided, it is converted to frame-weights "
+  echo "giving silence frames a weight of --silence-weight (default: 0.0)."
+  echo "If <weights-archive> is provided, it must be a single archive file compressed "
+  echo "(using gzip) containing per-frame weights for each utterance."
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --nj <nj>                                        # Number of jobs (also see num-threads)"
+  echo "  --num-threads <n>                                # Number of threads for each job"
+  echo "                                                   # Ignored if <ali-dir> or <decode-dir> supplied."
+  echo "  --stage <stage>                                  # To control partial reruns"
+  echo "  --num-gselect <n>                                # Number of Gaussians to select using"
+  echo "                                                   # diagonal model."
+  echo "  --min-post <min-post>                            # Pruning threshold for posteriors"
+  echo "  --ivector-period <n>                             # How often to extract an iVector (frames)"
+  echo "  --posterior-scale <scale>                        # Scale on posteriors in iVector extraction; "
+  echo "                                                   # affects strength of prior term."
+
+  exit 1;
+fi
+
+if [ $# -eq 4 ]; then
+  data=$1
+  lang=$2
+  srcdir=$3
+  dir=$4
+else # 5 arguments
+  data=$1
+  lang=$2
+  srcdir=$3
+  ali_or_decode_dir_or_weights=$4
+  dir=$5
+fi
+
+for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \
+    $lang/phones.txt $srcdir/online_cmvn.conf $srcdir/final.mat; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+mkdir -p $dir/log
+silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+
+if [ ! -z "$ali_or_decode_dir_or_weights" ]; then
+
+
+  if [ -f $ali_or_decode_dir_or_weights/ali.1.gz ]; then
+    if [ ! -f $ali_or_decode_dir_or_weights/${mdl}.mdl ]; then
+      echo "$0: expected $ali_or_decode_dir_or_weights/${mdl}.mdl to exist."
+      exit 1;
+    fi
+    nj_orig=$(cat $ali_or_decode_dir_or_weights/num_jobs) || exit 1;
+
+    if [ $stage -le 0 ]; then
+      rm $dir/weights.*.gz 2>/dev/null
+
+      $cmd JOB=1:$nj_orig $dir/log/ali_to_post.JOB.log \
+        gunzip -c $ali_or_decode_dir_or_weights/ali.JOB.gz \| \
+        ali-to-post ark:- ark:- \| \
+        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir_or_weights/final.mdl ark:- ark:- \| \
+        post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;
+
+      # put all the weights in one archive.
+      for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1;
+      rm $dir/weights.*.gz || exit 1;
+    fi
+
+  elif [ -f $ali_or_decode_dir_or_weights/lat.1.gz ]; then
+    nj_orig=$(cat $ali_or_decode_dir_or_weights/num_jobs) || exit 1;
+    if [ ! -f $ali_or_decode_dir_or_weights/../${mdl}.mdl ]; then
+      echo "$0: expected $ali_or_decode_dir_or_weights/../${mdl}.mdl to exist."
+      exit 1;
+    fi
+
+
+    if [ $stage -le 0 ]; then
+      rm $dir/weights.*.gz 2>/dev/null
+
+      $cmd JOB=1:$nj_orig $dir/log/lat_to_post.JOB.log \
+        lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir_or_weights/lat.JOB.gz|" ark:/dev/null ark:- \| \
+        ali-to-post ark:- ark:- \| \
+        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir_or_weights/../${mdl}.mdl ark:- ark:- \| \
+        post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;
+
+      # put all the weights in one archive.
+      for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1;
+      rm $dir/weights.*.gz || exit 1;
+    fi
+  elif [ -f $ali_or_decode_dir_or_weights ] && gunzip -c $ali_or_decode_dir_or_weights >/dev/null; then
+    cp $ali_or_decode_dir_or_weights $dir/weights.gz || exit 1;
+  else
+    echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir_or_weights";
+    exit 1;
+  fi
+fi
+
+sdata=$data/split$nj;
+utils/split_data.sh $data $nj || exit 1;
+
+echo $ivector_period > $dir/ivector_period || exit 1;
+splice_opts=$(cat $srcdir/splice_opts)
+
+gmm_feats="ark,s,cs:apply-cmvn-online --spk2utt=ark:$sdata/JOB/spk2utt --config=$srcdir/online_cmvn.conf $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
+feats="ark,s,cs:splice-feats $splice_opts scp:$sdata/JOB/feats.scp ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
+
+# This adds online-cmvn in $feats, upon request (configuration taken from UBM).
+[ -f $srcdir/online_cmvn_iextractor ] && feats="$gmm_feats"
+
+
+if [ $sub_speaker_frames -gt 0 ]; then
+
+  if [ $stage -le 1 ]; then
+    # We work out 'fake' spk2utt files that possibly split each speaker into multiple pieces.
+    if [ ! -z "$ali_or_decode_dir_or_weights" ]; then
+      gunzip -c $dir/weights.gz | copy-vector ark:- ark,t:- | \
+        awk '{ sum=0; for (n=3;n<NF;n++) sum += $n; print $1, sum; }' > $dir/utt_counts || exit 1;
+    else
+      feat-to-len scp:$data/feats.scp ark,t:- > $dir/utt_counts || exit 1;
+    fi
+    if ! [ $(wc -l <$dir/utt_counts) -eq $(wc -l <$data/feats.scp) ]; then
+      echo "$0: error getting per-utterance counts."
+#      exit 0;
+    fi
+#    cat $data/spk2utt | python -c "
+    utils/filter_scp.pl $dir/utt_counts $data/utt2spk | utils/utt2spk_to_spk2utt.pl | python -c "
+import sys
+utt_counts = {}
+trash = list(map(lambda x: utt_counts.update({x.split()[0]:float(x.split()[1])}), open('$dir/utt_counts').readlines()))
+sub_speaker_frames = $sub_speaker_frames
+lines = sys.stdin.readlines()
+total_counts = {}
+for line in lines:
+    parts = line.split()
+    spk = parts[0]
+    total_counts[spk] = 0
+    for utt in parts[1:]:
+        total_counts[spk] += utt_counts[utt]
+
+for line_index in range(len(lines)):
+    line = lines[line_index]
+    parts = line.split()
+    spk = parts[0]
+
+    numeric_id = 0
+    current_count = 0
+    covered_count = 0
+    current_utts = []
+    for utt in parts[1:]:
+        try:
+            current_count += utt_counts[utt]
+            covered_count += utt_counts[utt]
+        except KeyError:
+            raise Exception('No count found for the utterance {0}.'.format(utt))
+        current_utts.append(utt)
+        if ((current_count >= $sub_speaker_frames) and ((total_counts[spk] - covered_count) >= $sub_speaker_frames)) or (utt == parts[-1]):
+            spk_partial = '{0}-{1:06x}'.format(spk, numeric_id)
+            numeric_id += 1
+            print('{0} {1}'.format(spk_partial, ' '.join(current_utts)))
+            current_utts = []
+            current_count = 0
+" > $dir/spk2utt || exit 1;
+    mkdir -p $dir/split$nj
+    # create split versions of our spk2utt file.
+    for j in $(seq $nj); do
+      mkdir -p $dir/split$nj/$j
+      utils/filter_scp.pl -f 2 $sdata/$j/utt2spk <$dir/spk2utt >$dir/split$nj/$j/spk2utt || exit 1;
+      utils/spk2utt_to_utt2spk.pl <$dir/split$nj/$j/spk2utt >$dir/split$nj/$j/utt2spk || exit 1;
+    done
+  fi
+  this_sdata=$dir/split$nj
+else
+  this_sdata=$sdata
+fi
+
+if [ $stage -le 2 ]; then
+  if [ !
-z "$ali_or_decode_dir_or_weights" ]; then + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ + weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \ + ivector-extract --num-threads=$num_threads --acoustic-weight=$posterior_scale --compute-objf-change=true \ + --max-count=$max_count --spk2utt=ark:$this_sdata/JOB/spk2utt \ + $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1; + else + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ + ivector-extract --num-threads=$num_threads --acoustic-weight=$posterior_scale --compute-objf-change=true \ + --max-count=$max_count --spk2utt=ark:$this_sdata/JOB/spk2utt \ + $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1; + fi +fi + +# get an utterance-level set of iVectors (just duplicate the speaker-level ones). +# note: if $this_sdata is set $dir/split$nj, then these won't be real speakers, they'll +# be "sub-speakers" (speakers split up into multiple utterances). +if [ $stage -le 3 ]; then + for j in $(seq $nj); do + local/ts-vad/apply_map.pl --permissive -f 2 $dir/ivectors_spk.$j.ark <$this_sdata/$j/utt2spk >$dir/ivectors_utt.$j.ark || exit 1; + done +fi + +ivector_dim=$[$(head -n 1 $dir/ivectors_spk.1.ark | wc -w) - 3] || exit 1; +echo "$0: iVector dim is $ivector_dim" + +base_feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1; + +start_dim=$base_feat_dim +end_dim=$[$base_feat_dim+$ivector_dim-1] +absdir=$(utils/make_absolute.sh $dir) + +if [ $stage -le 4 ]; then + # here, we are just using the original features in $sdata/JOB/feats.scp for + # their number of rows; we use the select-feats command to remove those + # features and retain only the iVector features. + $cmd JOB=1:$nj $dir/log/duplicate_feats.JOB.log \ + append-vector-to-feats scp:$sdata/JOB/feats.scp ark:$dir/ivectors_utt.JOB.ark ark:- \| \ + select-feats "$start_dim-$end_dim" ark:- ark:- \| \ + subsample-feats --n=$ivector_period ark:- ark:- \| \ + copy-feats --compress=$compress ark:- \ + ark,scp:$absdir/ivector_online.JOB.ark,$absdir/ivector_online.JOB.scp || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: combining iVectors across jobs" + for j in $(seq $nj); do cat $dir/ivector_online.$j.scp; done >$dir/ivector_online.scp || exit 1; +fi + +steps/nnet2/get_ivector_id.sh $srcdir > $dir/final.ie.id || exit 1 + +echo "$0: done extracting (pseudo-online) iVectors to $dir using the extractor in $srcdir." + diff --git a/egs/chime6/s5b_track2/local/ts-vad/make_json_align.py b/egs/chime6/s5b_track2/local/ts-vad/make_json_align.py new file mode 100755 index 00000000000..1d13a85a425 --- /dev/null +++ b/egs/chime6/s5b_track2/local/ts-vad/make_json_align.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Yuri Khokhlov, Ivan Medennikov (STC-innovations Ltd) +# Apache 2.0. + +"""This script converts JSON to VAD alignment. +Single-speaker speech is treated as 1, whereas both silence and overlapped speech as 0. 
+This can be used for excluding overlapping regions from i-vector estimation."""
+
+import os
+import json
+import datetime
+import argparse
+import numpy as np
+from pathlib import Path
+from kaldiio import WriteHelper
+
+def time_to_seconds(time):
+    parts = time.split(':')
+    return datetime.timedelta(hours=float(parts[0]), minutes=float(parts[1]), seconds=float(parts[2])).total_seconds()
+
+class Segment:
+    def __init__(self, begin, end, label):
+        self.begin = begin
+        self.end = end
+        self.label = label
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Usage: make_json_align.py <json-path> <ali-wspec>')
+    parser.add_argument("--frequency", "-f", type=int, default=16000)
+    parser.add_argument("--frame_len", "-l", type=float, default=0.025)
+    parser.add_argument("--frame_shift", "-s", type=float, default=0.010)
+    parser.add_argument('json_path', type=str)
+    parser.add_argument('ali_wspec', type=str)
+    args = parser.parse_args()
+
+    frame_len = int(round(args.frame_len * args.frequency))
+    frame_shift = int(round(args.frame_shift * args.frequency))
+
+    print('Options:')
+    print('  Sampling frequency: {}'.format(args.frequency))
+    print('  Frame length in sec: {} ({})'.format(args.frame_len, frame_len))
+    print('  Frame shift in sec: {} ({})'.format(args.frame_shift, frame_shift))
+    print('  Path to the source JSON: {}'.format(args.json_path))
+    print('  Alignment write specifier: {}'.format(args.ali_wspec))
+
+    json_path = Path(args.json_path)
+    assert os.path.isfile(args.json_path), 'File does not exist {}'.format(args.json_path)
+
+    print('Loading file {}'.format(json_path))
+    with open(str(json_path)) as stream:
+        data = json.load(stream)
+    print('  loaded {} segments'.format(len(data)))
+
+    print('Building alignment')
+    duration = 0.0
+    for reco in data:
+        duration = max(time_to_seconds(reco['end_time']), duration)
+    print('  session duration {:.2f} hrs'.format(duration / 60 / 60))
+    total_frames = (int(duration * args.frequency) - frame_len) // frame_shift + 1
+    print('  total number of frames {}'.format(total_frames))
+    alignment = np.zeros(total_frames, dtype=np.int32)
+    for reco in data:
+        start_time = time_to_seconds(reco['start_time'])
+        end_time = time_to_seconds(reco['end_time'])
+        num_frames = (int((end_time - start_time) * args.frequency) - frame_len) // frame_shift + 1
+        start_frame = int(start_time * args.frequency) // frame_shift
+        end_frame = min(start_frame + num_frames, total_frames)
+        alignment[start_frame: end_frame] += 1
+    value = args.frame_shift * np.count_nonzero(alignment == 0)
+    print('  out of segments: {:.2f} hrs'.format(value / 60 / 60))
+    value = args.frame_shift * np.count_nonzero(alignment == 1)
+    print('  single speaker: {:.2f} hrs'.format(value / 60 / 60))
+    value = args.frame_shift * np.count_nonzero(alignment > 1)
+    print('  overlapped speech: {:.2f} hrs'.format(value / 60 / 60))
+
+    alignment = np.vectorize(lambda nspk: 0 if nspk > 1 else nspk)(alignment)
+
+    print('Writing alignment to {}'.format(args.ali_wspec))
+    with WriteHelper(args.ali_wspec) as writer:
+        writer(data[0]['session_id'], alignment)
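A toy check of the overlap squashing done by np.vectorize above: frames where more than one speaker is active become 0, like silence, so only single-speaker frames count as speech:

    import numpy as np

    counts = np.array([0, 1, 1, 2, 2, 1, 0])   # per-frame number of active speakers (toy)
    vad = np.where(counts > 1, 0, counts)
    print(vad)                                 # [0 1 1 0 0 1 0]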
diff --git a/egs/chime6/s5b_track2/local/ts-vad/make_negative_utt2spk.pl b/egs/chime6/s5b_track2/local/ts-vad/make_negative_utt2spk.pl
new file mode 100755
index 00000000000..a8128551d5c
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/make_negative_utt2spk.pl
@@ -0,0 +1,155 @@
+#!/usr/bin/perl
+# Copyright 2020 Ivan Medennikov (STC-innovations Ltd)
+# Apache 2.0.
+
+# This script creates 3 negative utt2spk files with speakers from the same session.
+
+($filein,$fileout,$fileout2,$fileout3)=@ARGV;
+
+$Nspk=4;
+
+%id2time2utt=();
+%utt2spk=();
+%utt2P=();
+%sid2spk=();
+
+open(fidin, "<$filein") or die "can't open $filein : $!";
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line);
+  $utt=$items[0];
+  $spk=$items[1];
+  $utt2spk{$utt}=$spk;
+  if ($utt=~/P(\d+)/)
+  {
+    $P=$1;
+  }
+  else
+  {
+    print "skipping utt $utt\n";
+    next;
+  }
+  $utt2P{$utt}=$P;
+  if ($utt=~/S(\d+)/)
+  {
+    $S=$1;
+  }
+  else
+  {
+    print "skipping utt $utt\n";
+    next;
+  }
+  if ($utt=~/\D(\d{7})-\d{7}/)
+  {
+    $beg=$1;
+  }
+  else
+  {
+    print "skipping utt $utt\n";
+    next;
+  }
+  $type=0;
+  if ($utt=~/rev/)
+  {
+    $type=1;
+  }
+  elsif ($utt=~/\.L/)
+  {
+    $type="L";
+  }
+  elsif ($utt=~/\.R/)
+  {
+    $type="R";
+  }
+  elsif ($utt=~/(U\d+).+(CH\d+)/)
+  {
+    $type="$1_$2";
+  }
+  if ($utt=~/(sp0.9)/)
+  {
+    $type="$1_$type";
+  }
+  if ($utt=~/(sp1.1)/)
+  {
+    $type="$1_$type";
+  }
+  $id="S$S\_$type";
+  if ( not exists $id2time2utt{$id} )
+  {
+    %{$id2time2utt{$id}}=();
+  }
+  push(@{$id2time2utt{$id}{$beg}},$utt);
+  $sid="$P\_$type";
+  push(@{$sid2spk{$sid}},$spk);
+}
+close(fidin);
+
+
+open(fidout, ">$fileout") or die "can't open $fileout : $!";
+open(fidout2, ">$fileout2") or die "can't open $fileout2 : $!";
+open(fidout3, ">$fileout3") or die "can't open $fileout3 : $!";
+foreach $id (sort keys %id2time2utt)
+{
+  $type="";
+  if ($id=~/S\d+\_(\S+)/)
+  {
+    $type=$1;
+  }
+  @utts=();
+  %curspk=();
+  foreach $time (sort keys %{$id2time2utt{$id}})
+  {
+    foreach $utt (@{$id2time2utt{$id}{$time}})
+    {
+      $P=$utt2P{$utt};
+      if ($utt=~/^\s*$/) { next; }
+      if (not exists $curspk{$P}) { $curspk{$P}=$utt2spk{$utt}; }
+      push(@utts,$utt);
+    }
+  }
+  foreach $utt (@utts)
+  {
+    $P=$utt2P{$utt};
+    $curspk{$P}=$utt2spk{$utt};
+    $Plast=int(($P-1)/$Nspk)*$Nspk+$Nspk;
+    $P1=$P+1;
+    if ($P1 > $Plast)
+    {
+      $P1=$P1-$Nspk;
+    }
+    if ($P1<10)
+    {
+      $P1="0$P1";
+    }
+    $P2=$P+2;
+    if ($P2 > $Plast)
+    {
+      $P2=$P2-$Nspk;
+    }
+    if ($P2<10)
+    {
+      $P2="0$P2";
+    }
+    $P3=$P+3;
+    if ($P3 > $Plast)
+    {
+      $P3=$P3-$Nspk;
+    }
+    if ($P3<10)
+    {
+      $P3="0$P3";
+    }
+    if ( not exists $curspk{$P1} ) { $sid=$P1."\_".$type; $cspk=$sid2spk{$sid}[rand @{$sid2spk{$sid}}]; print fidout "$utt $cspk\n"; }
+    else { print fidout "$utt $curspk{$P1}\n"; }
+    if ( not exists $curspk{$P2} ) { $sid=$P2."\_".$type; $cspk=$sid2spk{$sid}[rand @{$sid2spk{$sid}}]; print fidout2 "$utt $cspk\n"; }
+    else { print fidout2 "$utt $curspk{$P2}\n"; }
+    if ( not exists $curspk{$P3} ) { $sid=$P3."\_".$type; $cspk=$sid2spk{$sid}[rand @{$sid2spk{$sid}}]; print fidout3 "$utt $cspk\n"; }
+    else { print fidout3 "$utt $curspk{$P3}\n"; }
+  }
+}
+
+close(fidout);
+close(fidout2);
+close(fidout3);
+exit 0;
\ No newline at end of file
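The negative-speaker ids cycle within the session's group of four; a toy check of the wrap-around for speaker P07 (ids hypothetical; the Perl additionally zero-pads ids below 10):

    Nspk, P = 4, 7
    Plast = (P - 1) // Nspk * Nspk + Nspk                 # last id of the group: 8
    negatives = [p if p <= Plast else p - Nspk for p in (P + 1, P + 2, P + 3)]
    print(negatives)                                      # [8, 5, 6]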
+
+($filein,$fileout)=@ARGV;
+
+open(fidout, ">$fileout") or die "can't open $fileout : $!";
+open(fidin, "<$filein") or die "can't open $filein : $!";
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line);
+  $utt=$items[0];
+  $spk=$items[1];
+  if ($utt=~/P(\d+)/)
+  {
+    $P=$1;
+  }
+  else
+  {
+    print "skipping utt $utt\n";
+    next;
+  }
+  if ($utt=~/S(\d+)/)
+  {
+    $S=$1;
+  }
+  else
+  {
+    print "skipping utt $utt\n";
+    next;
+  }
+  if ($utt=~/\D(\d{7})-(\d{7})/)
+  {
+    $beg=$1;
+    $end=$2;
+  }
+  else
+  {
+    print "skipping utt $utt\n";
+    next;
+  }
+  $id="P$P\_S$S\_$beg\-$end";
+  print fidout "$utt $id\n";
+}
+close(fidin);
+close(fidout);
+
+exit 0;
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl b/egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl
new file mode 100755
index 00000000000..bc53308e021
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/modify_ups_utt2spk.pl
@@ -0,0 +1,45 @@
+#!/usr/bin/perl
+# Copyright 2020 Ivan Medennikov (STC-innovations Ltd)
+# Apache 2.0.
+
+# This script splits speakers in an utt2spk file,
+# leaving $ups utterances for each sub-speaker.
+
+($filein,$ups,$fileout)=@ARGV;
+
+open(fidin, "<$filein") or die "can't open $filein : $!";
+open(fidout, ">$fileout") or die "can't open $fileout : $!";
+%spk2utt=();
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line);
+  $utt=$items[0];
+  $spk=$items[1];
+  push (@{$spk2utt{$spk}},$utt);
+}
+close(fidin);
+
+foreach $spk (sort keys %spk2utt)
+{
+  $i=0;
+  $num=scalar @{$spk2utt{$spk}};
+  foreach $utt (sort @{$spk2utt{$spk}})
+  {
+    $sid=1+int($i/$ups);
+    # Merge a short tail of utterances into the previous sub-speaker.
+    if ($ups*$sid > $num)
+    {
+      $sid-=1;
+    }
+    if ($sid < 10)
+    {
+      $sid="0$sid";
+    }
+    print fidout "$utt $spk-$sid\n";
+    $i+=1;
+  }
+}
+close(fidout);
+exit 0;
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/ts-vad/prepare_json_weights.pl b/egs/chime6/s5b_track2/local/ts-vad/prepare_json_weights.pl
new file mode 100755
index 00000000000..f58e28fcc96
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/prepare_json_weights.pl
@@ -0,0 +1,57 @@
+#!/usr/bin/perl
+# Copyright 2020 Ivan Medennikov (STC-innovations Ltd)
+# Apache 2.0.
+
+# This script creates a per-utterance JSON alignment scp from a per-session
+# alignment scp and a segments file.
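+#
+# Each output line points into the per-session alignment ark with a Kaldi
+# range specifier, e.g. (hypothetical):
+#   P09_S03_U01_CH1-0001234-0005678 exp/ts-vad/ali.ark:12345[1234:5677]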
+
+($segments,$jsonali_scp,$jsonali_scp_perutt)=@ARGV;
+
+%ark=();
+
+open(fidin, "<$jsonali_scp") or die "can't open $jsonali_scp : $!";
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line);
+  $ark{$items[0]}=$items[1];
+  # Echo the loaded per-session scp entries for logging.
+  print "$items[0] $ark{$items[0]}\n";
+}
+close(fidin);
+
+open(fidin, "<$segments") or die "can't open $segments : $!";
+open(fidout, ">$jsonali_scp_perutt") or die "can't open $jsonali_scp_perutt : $!";
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line);
+  $utt=$items[0];
+  $wav=$items[1];
+  $beg=$items[2];
+  $end=$items[3];
+  if ($utt=~/_(S\d+).*(\d{7})-(\d{7})/)
+  {
+    $sess=$1;
+    $ubeg=$2;
+    $ubeg=~s/^0+//;
+    if ($utt=~/sp(\d+\.\d+)/)
+    {
+      # Rescale the utterance start for speed-perturbed copies.
+      $sp=$1;
+      $ubeg=int($ubeg/$sp+0.5);
+      $sess=$sess."_sp$sp";
+    }
+    if (($utt=~/^$wav\-\d+$/) || ($utt=~/^$wav$/))
+    {
+      # Segment times are relative to the utterance: add its start offset
+      # (converted to 10ms frames).
+      $beg=$ubeg+int($beg*100+0.5);
+      $end=$ubeg+int($end*100+0.5)-1;
+    }
+    else
+    {
+      $beg=int($beg*100+0.5);
+      $end=int($end*100+0.5)-1;
+    }
+    print fidout "$utt $ark{$sess}\[$beg\:$end\]\n";
+  }
+}
+close(fidin);
+close(fidout);
+exit 0;
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/ts-vad/shuffle_4spk_scp_utt2spk.pl b/egs/chime6/s5b_track2/local/ts-vad/shuffle_4spk_scp_utt2spk.pl
new file mode 100755
index 00000000000..6821609ab8e
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/shuffle_4spk_scp_utt2spk.pl
@@ -0,0 +1,151 @@
+#!/usr/bin/perl
+# Copyright 2020 Ivan Medennikov (STC-innovations Ltd)
+# Apache 2.0.
+
+# This script takes 4 scp files $filein{1,2,3,4} with the same utterance-ids
+# and produces 4 shuffled versions of them.
+# The same shuffling is applied to the 4 utt2spk files $utt2spk{1,2,3,4}.
+
+use List::Util qw(shuffle);
+
+($filein1,$filein2,$filein3,$filein4,$fileout1,$fileout2,$fileout3,$fileout4,$utt2spk1,$utt2spk2,$utt2spk3,$utt2spk4,$out1,$out2,$out3,$out4)=@ARGV;
+
+%utt2arks=();
+%utt2spk1=();
+open(fidin, "<$utt2spk1") or die "can't open $utt2spk1 : $!";
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line);
+  $utt=$items[0];
+  $spk=$items[1];
+  $utt2spk1{$utt}=$spk;
+}
+close(fidin);
+
+%utt2spk2=();
+open(fidin, "<$utt2spk2") or die "can't open $utt2spk2 : $!";
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line);
+  $utt=$items[0];
+  $spk=$items[1];
+  $utt2spk2{$utt}=$spk;
+}
+close(fidin);
+
+%utt2spk3=();
+open(fidin, "<$utt2spk3") or die "can't open $utt2spk3 : $!";
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line);
+  $utt=$items[0];
+  $spk=$items[1];
+  $utt2spk3{$utt}=$spk;
+}
+close(fidin);
+
+%utt2spk4=();
+open(fidin, "<$utt2spk4") or die "can't open $utt2spk4 : $!";
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line);
+  $utt=$items[0];
+  $spk=$items[1];
+  $utt2spk4{$utt}=$spk;
+}
+close(fidin);
+
+
+open(fidin, "<$filein1") or die "can't open $filein1 : $!";
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line, 2);
+  $utt=$items[0];
+  $ark=$items[1];
+  push(@{$utt2arks{$utt}},"$utt2spk1{$utt} $ark");
+}
+close(fidin);
+
+open(fidin, "<$filein2") or die "can't open $filein2 : $!";
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line, 2);
+  $utt=$items[0];
+  $ark=$items[1];
+  push(@{$utt2arks{$utt}},"$utt2spk2{$utt} $ark");
+}
+close(fidin);
+
+open(fidin, "<$filein3") or die "can't open $filein3 : $!";
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line, 2);
+  $utt=$items[0];
+  $ark=$items[1];
+  push(@{$utt2arks{$utt}},"$utt2spk3{$utt} $ark");
+}
+close(fidin);
+
+open(fidin, "<$filein4") or die "can't open $filein4 : $!";
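+# (Same reading pattern as the three scp loops above; after this, every
+# utterance-id present in all four inputs has four "spk ark" entries that
+# are shuffled together below.)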
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line, 2);
+  $utt=$items[0];
+  $ark=$items[1];
+  push(@{$utt2arks{$utt}},"$utt2spk4{$utt} $ark");
+}
+close(fidin);
+
+open(fidout1, ">$fileout1") or die "can't open $fileout1 : $!";
+open(fidout2, ">$fileout2") or die "can't open $fileout2 : $!";
+open(fidout3, ">$fileout3") or die "can't open $fileout3 : $!";
+open(fidout4, ">$fileout4") or die "can't open $fileout4 : $!";
+
+open(out1, ">$out1") or die "can't open $out1 : $!";
+open(out2, ">$out2") or die "can't open $out2 : $!";
+open(out3, ">$out3") or die "can't open $out3 : $!";
+open(out4, ">$out4") or die "can't open $out4 : $!";
+
+foreach $utt (sort(keys %utt2arks))
+{
+  # Skip utterances that do not occur in all four inputs.
+  if (scalar(@{$utt2arks{$utt}}) < 4)
+  {
+    next;
+  }
+  @shf = shuffle(@{$utt2arks{$utt}});
+  @u1 = split(/\s+/, $shf[0], 2);
+  @u2 = split(/\s+/, $shf[1], 2);
+  @u3 = split(/\s+/, $shf[2], 2);
+  @u4 = split(/\s+/, $shf[3], 2);
+
+  print fidout1 "$utt $u1[1]\n";
+  print fidout2 "$utt $u2[1]\n";
+  print fidout3 "$utt $u3[1]\n";
+  print fidout4 "$utt $u4[1]\n";
+
+  print out1 "$utt $u1[0]\n";
+  print out2 "$utt $u2[0]\n";
+  print out3 "$utt $u3[0]\n";
+  print out4 "$utt $u4[0]\n";
+}
+close(fidout1);
+close(fidout2);
+close(fidout3);
+close(fidout4);
+
+close(out1);
+close(out2);
+close(out3);
+close(out4);
+
+exit 0;
+
diff --git a/egs/chime6/s5b_track2/local/ts-vad/split_feats_seg.pl b/egs/chime6/s5b_track2/local/ts-vad/split_feats_seg.pl
new file mode 100755
index 00000000000..cdcf472bfc1
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/split_feats_seg.pl
@@ -0,0 +1,73 @@
+#!/usr/bin/perl
+# Copyright 2020 Ivan Medennikov (STC-innovations Ltd)
+# Apache 2.0.
+
+# This script splits each scp entry into fixed-length chunks of $chunk frames,
+# producing new scp, utt2spk and segments files.
+
+($filein,$utt2spk,$utt2dur,$chunk,$fileout,$fileout2,$fileout3)=@ARGV;
+
+%utt2dur=();
+open(fidin, "<$utt2dur") or die "can't open $utt2dur : $!";
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line);
+  $utt2dur{$items[0]}=$items[1];
+}
+close(fidin);
+
+%utt2spk=();
+open(fidin, "<$utt2spk") or die "can't open $utt2spk : $!";
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line);
+  $utt2spk{$items[0]}=$items[1];
+}
+close(fidin);
+
+
+open(fidin, "<$filein") or die "can't open $filein : $!";
+open(fidout, ">$fileout") or die "can't open $fileout : $!";
+open(fidout2, ">$fileout2") or die "can't open $fileout2 : $!";
+open(fidout3, ">$fileout3") or die "can't open $fileout3 : $!";
+
+while ($line=<fidin>)
+{
+  $line=~s/\s+$//;
+  @items=split(/\s+/,$line);
+  $begin=0;
+  $end=$begin+$chunk-1;
+  $id=1;
+  $suffix=$id;
+  while ($begin < $utt2dur{$items[0]})
+  {
+    $end=$begin+$chunk-1;
+    if ($end > $utt2dur{$items[0]}-1)
+    {
+      $end = $utt2dur{$items[0]}-1;
+    }
+    # Zero-pad the chunk index to 4 digits.
+    $suffix=$id;
+    if ($id < 1000)
+    {
+      $suffix="0$id";
+    }
+    if ($id < 100)
+    {
+      $suffix="00$id";
+    }
+    if ($id < 10)
+    {
+      $suffix="000$id";
+    }
+    print fidout "$items[0]-$suffix $items[1]\[$begin:$end\]\n";
+    print fidout2 "$items[0]-$suffix $utt2spk{$items[0]}\n";
+    $begin_sec=$begin/100.0;
+    $end_sec=$end/100.0;
+    print fidout3 "$items[0]-$suffix $items[0] $begin_sec $end_sec\n";
+    $begin=$begin+$chunk;
+    $id=$id+1;
+  }
+}
+close(fidin);
+close(fidout);
+close(fidout2);
+close(fidout3);
+exit 0;
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/local/ts-vad/vad_prob_mod.py b/egs/chime6/s5b_track2/local/ts-vad/vad_prob_mod.py
new file mode 100644
index 00000000000..a6b88833e6f
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/ts-vad/vad_prob_mod.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Ivan Medennikov (STC-innovations Ltd)
+# Apache 2.0.
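+
+# A worked example of the two thresholds applied below (hypothetical numbers):
+# with --threshold 0.3 and --multispk_threshold 0.8, a frame with per-speaker
+# probabilities (0.95, 0.10, 0.05, 0.03) keeps only speaker 1, since
+# 0.95 >= 0.3 and 0.95/1.13 = 0.84 >= 0.8; speaker 2 already fails the
+# absolute test (0.10 < 0.3). Rejected entries are floored to --lowest_value.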
+ +"""This script modifies TS-VAD output probabilities applying +absolute threshold (--threshold) and relative threshold (--multispk_threshold) for pi/(p1+p2+p3+p4) +(to exclude overlapping regions from i-vectors estimation)""" + +import os +import argparse +import regex as re +import numpy as np +from scipy import signal, ndimage +from kaldiio import ReadHelper, WriteHelper + +class WeightsSet: + def __init__(self, vad_rspec, reg_exp): + data = dict() + prev = -1 + with ReadHelper(vad_rspec) as reader: + for utid, align in reader: + result = reg_exp.match(utid) + assert result is not None, 'Wrong VAD alignment utterance ID format: \"{}\"'.format(utid) + sess = result.group(1) + piece = result.group(2) + spkr = result.group(3) + if sess not in data.keys(): + data[sess] = dict() + if piece not in data[sess].keys(): + data[sess][piece] = dict() + data[sess][piece][spkr]=align + reader.close() + print(' loaded {} sessions'.format(len(data))) + self.data = data + + def modify_prob(self, threshold, multispk_threshold, lowest_value): + for sess in self.data.keys(): + for piece in self.data[sess].keys(): + maxlen=0 + longest="" + for spkr in self.data[sess][piece].keys(): + if (len(self.data[sess][piece][spkr]) > maxlen): + maxlen=len(self.data[sess][piece][spkr]) + longest=spkr + sumprob=self.data[sess][piece][longest].copy() + for spkr in self.data[sess][piece].keys(): + if spkr == longest: + continue + for i in range(len(self.data[sess][piece][spkr])): + sumprob[i]+=self.data[sess][piece][spkr][i] + for spkr in self.data[sess][piece].keys(): + for i in range(len(self.data[sess][piece][spkr])): + if (self.data[sess][piece][spkr][i] < threshold): + self.data[sess][piece][spkr][i]=lowest_value + for spkr in self.data[sess][piece].keys(): + for i in range(len(self.data[sess][piece][spkr])): + if (self.data[sess][piece][spkr][i]/sumprob[i] < multispk_threshold): + self.data[sess][piece][spkr][i]=lowest_value + + + def write(self, vad_wspec): + with WriteHelper(vad_wspec) as writer: + for sess in self.data.keys(): + for piece in self.data[sess].keys(): + for spkr in self.data[sess][piece].keys(): + utt=sess+'-'+piece+'-'+spkr + writer(utt, self.data[sess][piece][spkr]) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Usage: vad_prob_mod.py ') + parser.add_argument("--reg_exp", "-x", type=str, default=r'^(S\d\d.*)\-(\d+)\-(\d)$') + parser.add_argument("--threshold", "-t", type=float, default=0.0) + parser.add_argument("--multispk_threshold", "-mt", type=float, default=0.8) + parser.add_argument("--lowest_value", "-l", type=float, default=0.00001) + parser.add_argument('vad_rspec', type=str) + parser.add_argument('vad_wspec', type=str) + args = parser.parse_args() + + print('Options:') + print(' Utterance ID regexp: {}'.format(args.reg_exp)) + print(' Absolute threshold: {}'.format(args.threshold)) + print(' Multispeaker threshold for Pi/(P1+P2+P3+P4): {}'.format(args.multispk_threshold)) + print(' Lowest value which is used when applying the thresholds: {}'.format(args.lowest_value)) + print(' VAD rspec: {}'.format(args.vad_rspec)) + print(' VAD wspec: {}'.format(args.vad_wspec)) + + reg_exp = re.compile(args.reg_exp) + + print('Loading VAD probabilities') + vad_align = WeightsSet(args.vad_rspec, reg_exp) + + print('Modifying VAD probabilities') + vad_align.modify_prob(args.threshold, args.multispk_threshold, args.lowest_value) + + print('Writing VAD probabilities') + vad_align.write(args.vad_wspec) diff --git a/egs/chime6/s5b_track2/local/uem_file 
diff --git a/egs/chime6/s5b_track2/local/uem_file b/egs/chime6/s5b_track2/local/uem_file
new file mode 100644
index 00000000000..c1d4dbcd5d4
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/uem_file
@@ -0,0 +1,20 @@
+S01_U01 1 0 12000
+S02_U01 1 75 12000
+S09_U01 1 64 12000
+S21_U01 1 59 12000
+S01_U02 1 0 12000
+S02_U02 1 75 12000
+S09_U02 1 64 12000
+S21_U02 1 59 12000
+S01_U03 1 0 12000
+S02_U03 1 75 12000
+S09_U03 1 64 12000
+S21_U03 1 59 12000
+S01_U04 1 0 12000
+S02_U04 1 75 12000
+S09_U04 1 64 12000
+S21_U04 1 59 12000
+S01_U06 1 0 12000
+S02_U06 1 75 12000
+S09_U06 1 64 12000
+S21_U06 1 59 12000
diff --git a/egs/chime6/s5b_track2/local/wer_output_filter b/egs/chime6/s5b_track2/local/wer_output_filter
new file mode 120000
index 00000000000..12a6c616d3d
--- /dev/null
+++ b/egs/chime6/s5b_track2/local/wer_output_filter
@@ -0,0 +1 @@
+../../s5_track1/local/wer_output_filter
\ No newline at end of file
diff --git a/egs/chime6/s5b_track2/path.sh b/egs/chime6/s5b_track2/path.sh
new file mode 100644
index 00000000000..2f4e4e4fb21
--- /dev/null
+++ b/egs/chime6/s5b_track2/path.sh
@@ -0,0 +1,9 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
+export PATH=$PWD/dscore:$PATH
+export PYTHONPATH="${PYTHONPATH}:$PWD/dscore"
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
+
diff --git a/egs/chime6/s5b_track2/run.sh b/egs/chime6/s5b_track2/run.sh
new file mode 100755
index 00000000000..7a271a787e4
--- /dev/null
+++ b/egs/chime6/s5b_track2/run.sh
@@ -0,0 +1,322 @@
+#!/usr/bin/env bash
+#
+# Chime-6 Track 2 baseline. Based mostly on the Chime-5 recipe, with the exception
+# that we are required to perform speech activity detection and speaker
+# diarization before ASR, since we do not have access to the oracle SAD and
+# diarization labels.
+#
+# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
+#           2019 Desh Raj, David Snyder, Ashish Arora
+# Apache 2.0
+
+# Begin configuration section.
+nj=50
+decode_nj=20
+stage=0
+nnet_stage=-10
+sad_stage=0
+diarizer_stage=0
+decode_stage=0
+ts_vad_stage=0
+enhancement=beamformit # for a new enhancement method,
+                       # change this variable and decode stage
+decode_only=false
+num_data_reps=4
+snrs="20:10:15:5:0"
+foreground_snrs="20:10:15:5:0"
+background_snrs="20:10:15:5:0"
+
+# End configuration section
+. ./utils/parse_options.sh
+
+. ./cmd.sh
+. ./path.sh
+
+if [ $decode_only == "true" ]; then
+  stage=19
+fi
+
+set -e # exit on error
+
+# chime5 main directory path
+# please change the path accordingly
+chime5_corpus=/export/corpora4/CHiME5
+# chime6 data directories, which are generated from ${chime5_corpus},
+# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly
+chime6_corpus=${PWD}/CHiME6
+json_dir=${chime6_corpus}/transcriptions
+audio_dir=${chime6_corpus}/audio
+
+# training and test data
+train_set=train_worn_simu_u400k
+sad_train_set=train_worn_u400k
+test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb"
+
+# TS-VAD options
+ts_vad_dir=exp/ts-vad_1a
+ivector_dir=exp/nnet3_${train_set}_cleaned_rvb
+
+# This script also needs the phonetisaurus g2p, srilm, beamformit
+./local/check_tools.sh || exit 1;
+
+###########################################################################
+# We first generate the synchronized audio files across arrays and
+# corresponding JSON files. Note that this requires sox v14.4.2,
+# which is installed via miniconda in ./local/check_tools.sh
+###########################################################################
+
+if [ $stage -le 0 ]; then
+  local/generate_chime6_data.sh \
+    --cmd "$train_cmd" \
+    ${chime5_corpus} \
+    ${chime6_corpus}
+fi
+
+###########################################################################
+# We prepare dict and lang in stages 1 to 3.
+###########################################################################
+
+if [ $stage -le 1 ]; then
+  # skip u03 and u04 as they are missing
+  for mictype in worn u01 u02 u05 u06; do
+    local/prepare_data.sh --mictype ${mictype} --train true \
+      ${audio_dir}/train ${json_dir}/train data/train_${mictype}
+  done
+  for dataset in dev; do
+    for mictype in worn; do
+      local/prepare_data.sh --mictype ${mictype} --train true \
+        ${audio_dir}/${dataset} ${json_dir}/${dataset} \
+        data/${dataset}_${mictype}
+    done
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  local/prepare_dict.sh
+
+  utils/prepare_lang.sh \
+    data/local/dict "<unk>" data/local/lang data/lang
+
+  local/train_lms_srilm.sh \
+    --train-text data/train_worn/text --dev-text data/dev_worn/text \
+    --oov-symbol "<unk>" --words-file data/lang/words.txt \
+    data/ data/srilm
+fi
+
+LM=data/srilm/best_3gram.gz
+if [ $stage -le 3 ]; then
+  # Compiles G for chime5 trigram LM
+  utils/format_lm.sh \
+    data/lang $LM data/local/dict/lexicon.txt data/lang
+
+fi
+
+if [ $stage -le 4 ]; then
+  # remove possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24)
+  # see http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details
+  utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up
+  grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text
+  utils/fix_data_dir.sh data/train_worn
+
+  # Remove S12_U05 from training data since it has known issues
+  utils/copy_data_dir.sh data/train_u05 data/train_u05_org # back up
+  grep -v -e "^S12_U05" data/train_u05_org/text > data/train_u05/text
+  utils/fix_data_dir.sh data/train_u05
+fi
+
+#########################################################################################
+# In stages 5 and 6, we augment and prepare the training data. Point-source
+# noises are extracted from the CHiME corpus. We use 400k utterances from the
+# array microphones, their reverberated augmentation, and all of the worn-set
+# utterances for training.
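+# (See stage 6 below: data/train_worn, data/train_worn_rvb and
+# data/train_u400k are combined into data/${train_set}.)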
+#########################################################################################
+
+if [ $stage -le 5 ]; then
+  echo "$0: Extracting noise list from training data"
+  local/extract_noises.py $chime6_corpus/audio/train $chime6_corpus/transcriptions/train \
+    local/distant_audio_list distant_noises
+  local/make_noise_list.py distant_noises > distant_noise_list
+
+  noise_list=distant_noise_list
+
+  echo "$0: Preparing simulated RIRs for data augmentation"
+  if [ ! -d RIRS_NOISES/ ]; then
+    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
+    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
+    unzip rirs_noises.zip
+  fi
+
+  # This is the config for the system using simulated RIRs and point-source noises
+  rvb_opts=()
+  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
+  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
+  rvb_opts+=(--noise-set-parameters $noise_list)
+
+  steps/data/reverberate_data_dir.py \
+    "${rvb_opts[@]}" \
+    --prefix "rev" \
+    --foreground-snrs $foreground_snrs \
+    --background-snrs $background_snrs \
+    --speech-rvb-probability 1 \
+    --pointsource-noise-addition-probability 1 \
+    --isotropic-noise-addition-probability 1 \
+    --num-replications $num_data_reps \
+    --max-noises-per-minute 1 \
+    --source-sampling-rate 16000 \
+    data/train_worn data/train_worn_rvb
+fi
+
+if [ $stage -le 6 ]; then
+  # Combine all array mics and take a random 400k-utterance subset, then
+  # combine that subset with the worn mics (original and reverberated).
+  # To include more training data, increase the number of array-mic utterances.
+  utils/combine_data.sh data/train_uall data/train_u01 data/train_u02 data/train_u05 data/train_u06
+  utils/subset_data_dir.sh data/train_uall 400000 data/train_u400k
+  utils/combine_data.sh data/${train_set} data/train_worn data/train_worn_rvb data/train_u400k
+  utils/combine_data.sh data/${sad_train_set} data/train_worn data/train_u400k
+fi
+
+if [ $stage -le 7 ]; then
+  # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and
+  # lets us use more jobs for decoding etc.
+  utils/copy_data_dir.sh data/${train_set} data/${train_set}_nosplit
+  utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${train_set}_nosplit data/${train_set}
+fi
+
+##################################################################################
+# Now make MFCC features. We use 13-dim MFCCs to train the GMM-HMM models.
+##################################################################################
+
+if [ $stage -le 8 ]; then
+  # Now make MFCC features.
+  # mfccdir should be some place with a largish disk where you
+  # want to store MFCC features.
+  echo "$0: make features..."
+  mfccdir=mfcc
+  steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \
+    --mfcc-config conf/mfcc.conf \
+    data/${train_set} exp/make_mfcc/${train_set} $mfccdir
+  steps/compute_cmvn_stats.sh data/${train_set} exp/make_mfcc/${train_set} $mfccdir
+  utils/fix_data_dir.sh data/${train_set}
+fi
+
+###################################################################################
+# Stages 9 to 14 train monophone and triphone models. They will be used for
+# generating lattices for training the chain model and for obtaining targets
+# for training the SAD system.
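+# (The sequence is: mono -> tri1 (deltas) -> tri2 (LDA+MLLT) -> tri3 (SAT),
+# followed by data cleanup in stage 14.)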
+################################################################################### + +if [ $stage -le 9 ]; then + # make a subset for monophone training + utils/subset_data_dir.sh --shortest data/${train_set} 100000 data/${train_set}_100kshort + utils/subset_data_dir.sh data/${train_set}_100kshort 30000 data/${train_set}_30kshort +fi + +if [ $stage -le 10 ]; then + # Starting basic training on MFCC features + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_30kshort data/lang exp/mono +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1 +fi + +if [ $stage -le 12 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 13 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 14 ]; then + # The following script cleans the data and produces cleaned data + steps/cleanup/clean_and_segment_data.sh --nj $nj --cmd "$train_cmd" \ + --segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \ + data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned +fi + +########################################################################## +# CHAIN MODEL TRAINING +# You can also download a pretrained chain ASR model using: +# wget http://kaldi-asr.org/models/12/0012_asr_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_asr_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ +########################################################################## +if [ $stage -le 15 ]; then + # chain TDNN + local/chain/run_tdnn.sh --nj $nj \ + --stage $nnet_stage \ + --train-set ${train_set}_cleaned \ + --test-sets "$test_sets" \ + --gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned_rvb +fi + +########################################################################## +# SAD MODEL TRAINING +# You can also download a pretrained SAD model using: +# wget http://kaldi-asr.org/models/12/0012_sad_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_sad_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ +########################################################################## +if [ $stage -le 16 ]; then + local/train_sad.sh --stage $sad_stage --nj $nj \ + --data-dir data/${sad_train_set} --test-sets "${test_sets}" \ + --sat-model-dir exp/tri3_cleaned \ + --model-dir exp/tri2 +fi + +########################################################################## +# DIARIZATION MODEL TRAINING +# You can also download a pretrained diarization model using: +# wget http://kaldi-asr.org/models/12/0012_diarization_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_diarization_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ +########################################################################## +if [ $stage -le 17 ]; then + local/train_diarizer.sh --stage $diarizer_stage \ + --data-dir data/${train_set} \ + --model-dir exp/xvector_nnet_1a +fi + 
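+# At this point exp/xvector_nnet_1a holds the x-vector extractor used for the
+# first-pass diarization (x-vectors + spectral clustering; see the decoding
+# stage below), whose output the TS-VAD iterations then refine.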
+########################################################################## +# TS-VAD MODEL TRAINING +# You can also download a pretrained diarization model using: +# ts_vad_name=ts-vad_1a.tar.gz +# ts_vad_link=https://github.com/yuri-hohlov/ts-vad-data/raw/master/${ts_vad_name} +# [ ! -f $ts_vad_name ] && wget -O $ts_vad_name $ts_vad_link +# [ ! -d $ts_vad_dir ] && tar -zxvf $ts_vad_name -C $(dirname $ts_vad_dir) +########################################################################## +if [ $stage -le 18 ]; then + local/train_ts-vad.sh --stage $ts_vad_stage \ + --nnet3-affix _${train_set}_cleaned_rvb \ + --basedata ${train_set}_cleaned_sp +fi + +########################################################################## +# DECODING: In track 2, we are given raw utterances without segment +# or speaker information, so we have to decode the whole pipeline, i.e., +# SAD -> Diarization (x-vectors + Spectral Clustering) -> +# 3 iterations of TS-VAD Diarization -> GSS -> ASR. +# This is done in the local/decode_ts-vad.sh script. +########################################################################## +if [ $stage -le 19 ]; then + local/decode_ts-vad.sh --stage $decode_stage \ + --ts-vad-dir $ts_vad_dir --ivector-dir $ivector_dir \ + --enhancement $enhancement \ + --test-sets "$test_sets" +fi + +exit 0; + diff --git a/egs/chime6/s5b_track2/sid b/egs/chime6/s5b_track2/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/chime6/s5b_track2/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/chime6/s5b_track2/steps b/egs/chime6/s5b_track2/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/chime6/s5b_track2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/chime6/s5b_track2/utils b/egs/chime6/s5b_track2/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/chime6/s5b_track2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/src/featbin/Makefile b/src/featbin/Makefile index 861ba3f7a93..614cc77d80d 100644 --- a/src/featbin/Makefile +++ b/src/featbin/Makefile @@ -17,7 +17,7 @@ BINFILES = add-deltas add-deltas-sdc append-post-to-feats \ process-kaldi-pitch-feats process-pitch-feats \ select-feats shift-feats splice-feats subsample-feats \ subset-feats transform-feats wav-copy wav-reverberate \ - wav-to-duration + wav-to-duration multiply-vectors paste-vectors OBJFILES = diff --git a/src/featbin/multiply-vectors.cc b/src/featbin/multiply-vectors.cc new file mode 100644 index 00000000000..70c9e6dcf63 --- /dev/null +++ b/src/featbin/multiply-vectors.cc @@ -0,0 +1,173 @@ +// featbin/multiply-vectors.cc + +// Copyright 2012 Korbinian Riedhammer +// 2013 Brno University of Technology (Author: Karel Vesely) +// 2013 Johns Hopkins University (Author: Daniel Povey) +// 2020 Ivan Medennikov (STC-innovations Ltd) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "matrix/kaldi-matrix.h"
+
+namespace kaldi {
+
+// returns true if successfully multiplied.
+bool MultiplyVectors(const std::vector<Vector<BaseFloat> > &in,
+                     std::string utt,
+                     int32 tolerance,
+                     Vector<BaseFloat> *out) {
+  // Check the lengths
+  int32 min_len = in[0].Dim(),
+      max_len = in[0].Dim();
+  for (int32 i = 1; i < in.size(); i++) {
+    int32 len = in[i].Dim();
+    if (len < min_len) min_len = len;
+    if (len > max_len) max_len = len;
+  }
+  if (max_len - min_len > tolerance || min_len == 0) {
+    KALDI_WARN << "Length mismatch " << max_len << " vs. " << min_len
+               << (utt.empty() ? "" : " for utt ") << utt
+               << " exceeds tolerance " << tolerance;
+    out->Resize(0);
+    return false;
+  }
+  if (max_len - min_len > 0) {
+    KALDI_VLOG(2) << "Length mismatch " << max_len << " vs. " << min_len
+                  << (utt.empty() ? "" : " for utt ") << utt
+                  << " within tolerance " << tolerance;
+  }
+  // Multiply element-wise over the common (shortest) length.
+  out->Resize(min_len);
+  out->Set(1.0);
+  for (int32 i = 0; i < in.size(); i++) {
+    out->MulElements(in[i].Range(0, min_len));
+  }
+  return true;
+}
+
+
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace std;
+
+    const char *usage =
+        "Multiply vectors frame-by-frame (assuming they have about the same durations, see --length-tolerance);\n"
+        "Usage: multiply-vectors <in-rspecifier1> <in-rspecifier2> [<in-rspecifierN> ...] <out-wspecifier>\n"
+        " or: multiply-vectors <in-rxfilename1> <in-rxfilename2> [<in-rxfilenameN> ...] <out-wxfilename>\n"
+        " e.g. multiply-vectors ark:vec1.ark ark:vec2.ark ark:out.ark\n"
+        " or: multiply-vectors foo.mat bar.mat baz.mat\n"
+        "See also: paste-feats, copy-vector, append-vector-to-feats\n";
+
+    ParseOptions po(usage);
+
+    int32 length_tolerance = 0;
+    bool binary = true;
+    po.Register("length-tolerance", &length_tolerance,
+                "If length is different, trim as shortest up to a frame "
+                "difference of length-tolerance, otherwise exclude segment.");
+    po.Register("binary", &binary, "If true, output files in binary "
+                "(only relevant for single-file operation, i.e. no tables)");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    if (ClassifyRspecifier(po.GetArg(1), NULL, NULL)
+        != kNoRspecifier) {
+      // We're operating on tables, e.g. archives.
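+      // The first input drives the iteration; the remaining inputs are
+      // opened for random access, so utterances missing from any of them
+      // are skipped with a warning instead of aborting the run.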
+
+      // Last argument is output
+      string wspecifier = po.GetArg(po.NumArgs());
+      BaseFloatVectorWriter vector_writer(wspecifier);
+
+      // First input is sequential
+      string rspecifier1 = po.GetArg(1);
+      SequentialBaseFloatVectorReader input1(rspecifier1);
+
+      // Assemble vector of other input readers (with random-access)
+      vector<RandomAccessBaseFloatVectorReader *> input;
+      for (int32 i = 2; i < po.NumArgs(); i++) {
+        string rspecifier = po.GetArg(i);
+        RandomAccessBaseFloatVectorReader *rd = new RandomAccessBaseFloatVectorReader(rspecifier);
+        input.push_back(rd);
+      }
+
+      int32 num_done = 0, num_err = 0;
+
+      // Main loop
+      for (; !input1.Done(); input1.Next()) {
+        string utt = input1.Key();
+        KALDI_VLOG(2) << "Multiplying vectors for utterance " << utt;
+
+        // Collect features from streams to vector 'vectors'
+        vector<Vector<BaseFloat> > vectors(po.NumArgs() - 1);
+        vectors[0] = input1.Value();
+        int32 i;
+        for (i = 0; i < static_cast<int32>(input.size()); i++) {
+          if (input[i]->HasKey(utt)) {
+            vectors[i + 1] = input[i]->Value(utt);
+          } else {
+            KALDI_WARN << "Missing utt " << utt << " from input "
+                       << po.GetArg(i+2);
+            num_err++;
+            break;
+          }
+        }
+        if (i != static_cast<int32>(input.size()))
+          continue;
+        Vector<BaseFloat> output;
+        if (!MultiplyVectors(vectors, utt, length_tolerance, &output)) {
+          num_err++;
+          continue;  // it will have printed a warning.
+        }
+        vector_writer.Write(utt, output);
+        num_done++;
+      }
+
+      for (int32 i = 0; i < input.size(); i++)
+        delete input[i];
+      input.clear();
+
+      KALDI_LOG << "Done " << num_done << " utts, errors on "
+                << num_err;
+
+      return (num_done == 0 ? -1 : 0);
+    } else {
+      // We're operating on rxfilenames|wxfilenames, most likely files.
+      std::vector<Vector<BaseFloat> > vectors(po.NumArgs() - 1);
+      for (int32 i = 1; i < po.NumArgs(); i++)
+        ReadKaldiObject(po.GetArg(i), &(vectors[i-1]));
+      Vector<BaseFloat> output;
+      if (!MultiplyVectors(vectors, "", length_tolerance, &output))
+        return 1;  // it will have printed a warning.
+      std::string output_wxfilename = po.GetArg(po.NumArgs());
+      WriteKaldiObject(output, output_wxfilename, binary);
+      KALDI_LOG << "Wrote multiplied vector to " << output_wxfilename;
+      return 0;
+    }
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/featbin/paste-vectors.cc b/src/featbin/paste-vectors.cc
new file mode 100644
index 00000000000..06d373ba7e0
--- /dev/null
+++ b/src/featbin/paste-vectors.cc
@@ -0,0 +1,138 @@
+// featbin/paste-vectors.cc
+
+// Copyright 2012 Korbinian Riedhammer
+//           2013 Brno University of Technology (Author: Karel Vesely)
+//           2013 Johns Hopkins University (Author: Daniel Povey)
+//           2020 Ivan Medennikov (STC-innovations Ltd)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "matrix/kaldi-matrix.h"
+
+namespace kaldi {
+
+// returns true if successfully appended.
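+// e.g. pasting a 3-dim and a 4-dim vector yields a 7-dim vector with the
+// inputs laid out end to end.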
+bool AppendVectors(const std::vector<Vector<BaseFloat> > &in,
+                   std::string utt,
+                   Vector<BaseFloat> *out) {
+  // Compute the total dimension of the output vector.
+  int32 tot_dim = in[0].Dim();
+  for (int32 i = 1; i < in.size(); i++) {
+    int32 dim = in[i].Dim();
+    tot_dim += dim;
+  }
+  out->Resize(tot_dim);
+  int32 dim_offset = 0;
+  for (int32 i = 0; i < in.size(); i++) {
+    int32 this_dim = in[i].Dim();
+    out->Range(dim_offset, this_dim).CopyFromVec(
+        in[i].Range(0, this_dim));
+    dim_offset += this_dim;
+  }
+  return true;
+}
+
+
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace std;
+
+    const char *usage =
+        "Paste vector files\n"
+        "Usage: paste-vectors <in-rspecifier1> <in-rspecifier2> [<in-rspecifierN> ...] <out-wspecifier>\n"
+        "See also: paste-feats, copy-feats, copy-matrix, append-vector-to-feats, concat-feats\n";
+
+    ParseOptions po(usage);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    if (ClassifyRspecifier(po.GetArg(1), NULL, NULL)
+        != kNoRspecifier) {
+      // We're operating on tables, e.g. archives.
+
+      // Last argument is output
+      string wspecifier = po.GetArg(po.NumArgs());
+      BaseFloatVectorWriter vec_writer(wspecifier);
+
+      // First input is sequential
+      string rspecifier1 = po.GetArg(1);
+      SequentialBaseFloatVectorReader input1(rspecifier1);
+
+      // Assemble vector of other input readers (with random-access)
+      vector<RandomAccessBaseFloatVectorReader *> input;
+      for (int32 i = 2; i < po.NumArgs(); i++) {
+        string rspecifier = po.GetArg(i);
+        RandomAccessBaseFloatVectorReader *rd = new RandomAccessBaseFloatVectorReader(rspecifier);
+        input.push_back(rd);
+      }
+
+      int32 num_done = 0, num_err = 0;
+
+      // Main loop
+      for (; !input1.Done(); input1.Next()) {
+        string utt = input1.Key();
+        KALDI_VLOG(2) << "Merging vectors for utterance " << utt;
+
+        // Collect vectors from streams to vector 'vectors'
+        vector<Vector<BaseFloat> > vectors(po.NumArgs() - 1);
+        vectors[0] = input1.Value();
+        int32 i;
+        for (i = 0; i < static_cast<int32>(input.size()); i++) {
+          if (input[i]->HasKey(utt)) {
+            vectors[i + 1] = input[i]->Value(utt);
+          } else {
+            KALDI_WARN << "Missing utt " << utt << " from input "
+                       << po.GetArg(i+2);
+            num_err++;
+            break;
+          }
+        }
+        if (i != static_cast<int32>(input.size()))
+          continue;
+        Vector<BaseFloat> output;
+        if (!AppendVectors(vectors, utt, &output)) {
+          num_err++;
+          continue;  // it will have printed a warning.
+        }
+        vec_writer.Write(utt, output);
+        num_done++;
+      }
+
+      for (int32 i = 0; i < input.size(); i++)
+        delete input[i];
+      input.clear();
+
+      KALDI_LOG << "Done " << num_done << " utts, errors on "
+                << num_err;
+
+      return (num_done == 0 ? -1 : 0);
+    } else {
+      // Unlike multiply-vectors, this binary supports only table
+      // (rspecifier/wspecifier) arguments; fail loudly instead of
+      // falling off the end of main() without a return value.
+      KALDI_ERR << "paste-vectors: expected rspecifiers/wspecifier "
+                << "as arguments.";
+      return -1;
+    }
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}