From 167066292eceb12b1604cc9d6581b766ad061b09 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Fri, 13 Nov 2020 16:13:10 -0500 Subject: [PATCH] [egs] LibriCSS recipe (#4321) Refer to the README.md to each eg directory for description. --- .../v1/diarization/vb_hmm_xvector.py | 89 +------ .../v1/diarization/vb_hmm_xvector.sh | 23 +- egs/libri_css/README.md | 63 +++++ egs/libri_css/s5_css/cmd.sh | 14 + egs/libri_css/s5_css/conf/mfcc.conf | 2 + egs/libri_css/s5_css/conf/mfcc_hires.conf | 10 + egs/libri_css/s5_css/conf/online_cmvn.conf | 1 + egs/libri_css/s5_css/diarization | 1 + egs/libri_css/s5_css/local | 1 + egs/libri_css/s5_css/path.sh | 9 + egs/libri_css/s5_css/rnnlm | 1 + egs/libri_css/s5_css/run.sh | 243 ++++++++++++++++++ egs/libri_css/s5_css/sid | 1 + egs/libri_css/s5_css/steps | 1 + egs/libri_css/s5_css/utils | 1 + egs/libri_css/s5_mono/cmd.sh | 14 + egs/libri_css/s5_mono/conf/mfcc.conf | 2 + egs/libri_css/s5_mono/conf/mfcc_hires.conf | 10 + egs/libri_css/s5_mono/conf/online_cmvn.conf | 1 + egs/libri_css/s5_mono/diarization | 1 + .../s5_mono/local/best_wer_matching.py | 72 ++++++ .../s5_mono/local/chain/run_chain_common.sh | 80 ++++++ egs/libri_css/s5_mono/local/chain/run_tdnn.sh | 1 + .../s5_mono/local/chain/tuning/run_tdnn_1d.sh | 171 ++++++++++++ .../local/chain/tuning/run_tdnn_1d_ft.sh | 241 +++++++++++++++++ .../convert_rttm_to_utt2spk_and_segments.py | 98 +++++++ egs/libri_css/s5_mono/local/data_prep_css.sh | 94 +++++++ .../s5_mono/local/data_prep_librispeech.sh | 79 ++++++ egs/libri_css/s5_mono/local/data_prep_mono.sh | 89 +++++++ egs/libri_css/s5_mono/local/decode.sh | 227 ++++++++++++++++ .../s5_mono/local/decode_diarized.sh | 78 ++++++ .../s5_mono/local/decode_diarized_css.sh | 84 ++++++ egs/libri_css/s5_mono/local/decode_oracle.sh | 125 +++++++++ .../s5_mono/local/detect_speech_activity.sh | 225 ++++++++++++++++ .../diarization/post_process_css_rttm.py | 121 +++++++++ .../s5_mono/local/diarization/scluster.sh | 100 +++++++ egs/libri_css/s5_mono/local/diarize.sh | 87 +++++++ egs/libri_css/s5_mono/local/diarize_css.sh | 131 ++++++++++ .../s5_mono/local/diarize_spectral.sh | 81 ++++++ .../s5_mono/local/download_and_untar.sh | 100 +++++++ .../s5_mono/local/download_diarizer.sh | 38 +++ egs/libri_css/s5_mono/local/download_lm.sh | 76 ++++++ egs/libri_css/s5_mono/local/dscore.sh | 70 +++++ .../s5_mono/local/extract_vad_weights.sh | 86 +++++++ egs/libri_css/s5_mono/local/format_lms.sh | 60 +++++ .../s5_mono/local/get_perspeaker_output.py | 91 +++++++ egs/libri_css/s5_mono/local/make_voxceleb1.pl | 130 ++++++++++ egs/libri_css/s5_mono/local/make_voxceleb2.pl | 70 +++++ .../s5_mono/local/multispeaker_score.sh | 111 ++++++++ egs/libri_css/s5_mono/local/nnet3/decode.sh | 163 ++++++++++++ .../s5_mono/local/nnet3/run_ivector_common.sh | 149 +++++++++++ .../local/nnet3/xvector/prepare_feats.sh | 89 +++++++ .../nnet3/xvector/prepare_feats_for_egs.sh | 83 ++++++ .../local/nnet3/xvector/run_xvector.sh | 1 + .../nnet3/xvector/tuning/run_xvector_1a.sh | 149 +++++++++++ egs/libri_css/s5_mono/local/prepare_data.py | 104 ++++++++ .../s5_mono/local/prepare_data_css.py | 92 +++++++ egs/libri_css/s5_mono/local/prepare_dict.sh | 143 +++++++++++ egs/libri_css/s5_mono/local/rnnlm/train.sh | 1 + .../local/rnnlm/tuning/run_tdnn_lstm_1a.sh | 130 ++++++++++ .../s5_mono/local/run_cleanup_segmentation.sh | 63 +++++ egs/libri_css/s5_mono/local/score.sh | 1 + .../s5_mono/local/score_reco_diarized.sh | 147 +++++++++++ .../s5_mono/local/score_reco_oracle.sh | 107 ++++++++ 
.../local/segmentation/apply_webrtcvad.py | 212 +++++++++++++++ .../segmentation/detect_speech_activity.sh | 217 ++++++++++++++++ egs/libri_css/s5_mono/local/train_asr.sh | 205 +++++++++++++++ egs/libri_css/s5_mono/local/train_diarizer.sh | 186 ++++++++++++++ egs/libri_css/s5_mono/local/wer_output_filter | 25 ++ egs/libri_css/s5_mono/path.sh | 10 + egs/libri_css/s5_mono/rnnlm | 1 + egs/libri_css/s5_mono/run.sh | 99 +++++++ egs/libri_css/s5_mono/sid | 1 + egs/libri_css/s5_mono/steps | 1 + egs/libri_css/s5_mono/utils | 1 + 75 files changed, 5787 insertions(+), 97 deletions(-) create mode 100644 egs/libri_css/README.md create mode 100644 egs/libri_css/s5_css/cmd.sh create mode 100644 egs/libri_css/s5_css/conf/mfcc.conf create mode 100644 egs/libri_css/s5_css/conf/mfcc_hires.conf create mode 100644 egs/libri_css/s5_css/conf/online_cmvn.conf create mode 120000 egs/libri_css/s5_css/diarization create mode 120000 egs/libri_css/s5_css/local create mode 100644 egs/libri_css/s5_css/path.sh create mode 120000 egs/libri_css/s5_css/rnnlm create mode 100755 egs/libri_css/s5_css/run.sh create mode 120000 egs/libri_css/s5_css/sid create mode 120000 egs/libri_css/s5_css/steps create mode 120000 egs/libri_css/s5_css/utils create mode 100644 egs/libri_css/s5_mono/cmd.sh create mode 100644 egs/libri_css/s5_mono/conf/mfcc.conf create mode 100644 egs/libri_css/s5_mono/conf/mfcc_hires.conf create mode 100644 egs/libri_css/s5_mono/conf/online_cmvn.conf create mode 120000 egs/libri_css/s5_mono/diarization create mode 100755 egs/libri_css/s5_mono/local/best_wer_matching.py create mode 100755 egs/libri_css/s5_mono/local/chain/run_chain_common.sh create mode 120000 egs/libri_css/s5_mono/local/chain/run_tdnn.sh create mode 100755 egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d.sh create mode 100755 egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d_ft.sh create mode 100755 egs/libri_css/s5_mono/local/convert_rttm_to_utt2spk_and_segments.py create mode 100755 egs/libri_css/s5_mono/local/data_prep_css.sh create mode 100755 egs/libri_css/s5_mono/local/data_prep_librispeech.sh create mode 100755 egs/libri_css/s5_mono/local/data_prep_mono.sh create mode 100755 egs/libri_css/s5_mono/local/decode.sh create mode 100755 egs/libri_css/s5_mono/local/decode_diarized.sh create mode 100755 egs/libri_css/s5_mono/local/decode_diarized_css.sh create mode 100755 egs/libri_css/s5_mono/local/decode_oracle.sh create mode 100755 egs/libri_css/s5_mono/local/detect_speech_activity.sh create mode 100755 egs/libri_css/s5_mono/local/diarization/post_process_css_rttm.py create mode 100755 egs/libri_css/s5_mono/local/diarization/scluster.sh create mode 100755 egs/libri_css/s5_mono/local/diarize.sh create mode 100755 egs/libri_css/s5_mono/local/diarize_css.sh create mode 100755 egs/libri_css/s5_mono/local/diarize_spectral.sh create mode 100755 egs/libri_css/s5_mono/local/download_and_untar.sh create mode 100755 egs/libri_css/s5_mono/local/download_diarizer.sh create mode 100755 egs/libri_css/s5_mono/local/download_lm.sh create mode 100644 egs/libri_css/s5_mono/local/dscore.sh create mode 100755 egs/libri_css/s5_mono/local/extract_vad_weights.sh create mode 100755 egs/libri_css/s5_mono/local/format_lms.sh create mode 100755 egs/libri_css/s5_mono/local/get_perspeaker_output.py create mode 100755 egs/libri_css/s5_mono/local/make_voxceleb1.pl create mode 100755 egs/libri_css/s5_mono/local/make_voxceleb2.pl create mode 100755 egs/libri_css/s5_mono/local/multispeaker_score.sh create mode 100755 egs/libri_css/s5_mono/local/nnet3/decode.sh create 
mode 100755 egs/libri_css/s5_mono/local/nnet3/run_ivector_common.sh create mode 100755 egs/libri_css/s5_mono/local/nnet3/xvector/prepare_feats.sh create mode 100755 egs/libri_css/s5_mono/local/nnet3/xvector/prepare_feats_for_egs.sh create mode 120000 egs/libri_css/s5_mono/local/nnet3/xvector/run_xvector.sh create mode 100755 egs/libri_css/s5_mono/local/nnet3/xvector/tuning/run_xvector_1a.sh create mode 100755 egs/libri_css/s5_mono/local/prepare_data.py create mode 100755 egs/libri_css/s5_mono/local/prepare_data_css.py create mode 100755 egs/libri_css/s5_mono/local/prepare_dict.sh create mode 120000 egs/libri_css/s5_mono/local/rnnlm/train.sh create mode 100755 egs/libri_css/s5_mono/local/rnnlm/tuning/run_tdnn_lstm_1a.sh create mode 100755 egs/libri_css/s5_mono/local/run_cleanup_segmentation.sh create mode 120000 egs/libri_css/s5_mono/local/score.sh create mode 100755 egs/libri_css/s5_mono/local/score_reco_diarized.sh create mode 100755 egs/libri_css/s5_mono/local/score_reco_oracle.sh create mode 100755 egs/libri_css/s5_mono/local/segmentation/apply_webrtcvad.py create mode 100755 egs/libri_css/s5_mono/local/segmentation/detect_speech_activity.sh create mode 100755 egs/libri_css/s5_mono/local/train_asr.sh create mode 100755 egs/libri_css/s5_mono/local/train_diarizer.sh create mode 100755 egs/libri_css/s5_mono/local/wer_output_filter create mode 100644 egs/libri_css/s5_mono/path.sh create mode 120000 egs/libri_css/s5_mono/rnnlm create mode 100755 egs/libri_css/s5_mono/run.sh create mode 120000 egs/libri_css/s5_mono/sid create mode 120000 egs/libri_css/s5_mono/steps create mode 120000 egs/libri_css/s5_mono/utils diff --git a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py index 87625a29b25..a284abbeb4a 100644 --- a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py +++ b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2020 Johns Hopkins University (Author: Desh Raj) # Apache 2.0 @@ -9,7 +9,7 @@ # vb_hmm_xvector.sh which can divide all labels into per recording # labels. -import sys, argparse, struct, re +import sys, argparse, struct import numpy as np import itertools import kaldi_io @@ -36,9 +36,6 @@ def get_args(): help="scale sufficient statistics collected using UBM") parser.add_argument("--fb", type=float, default=11, help="speaker regularization coefficient Fb (controls final # of speaker)") - parser.add_argument("--overlap-rttm", type=str, - help="path to an RTTM file containing overlap segments. 
If provided," - "multiple speaker labels will be allocated to these segments.") parser.add_argument("xvector_ark_file", type=str, help="Ark file containing xvectors for all subsegments") parser.add_argument("plda", type=str, @@ -61,59 +58,12 @@ def read_labels_file(label_file): return segments, labels def write_labels_file(seg2label, out_file): - f = open(out_file, 'w') - for seg in sorted(seg2label.keys()): - label = seg2label[seg] - if type(label) is tuple: - f.write("{} {}\n".format(seg, label[0])) - f.write("{} {}\n".format(seg, label[1])) - else: - f.write("{} {}\n".format(seg, label)) - f.close() + with open(out_file, 'w') as f: + for seg in sorted(seg2label.keys()): + label = seg2label[seg] + f.write(f"{seg} {label}\n") return -def get_overlap_decision(overlap_segs, subsegment, frac = 0.5): - """ Returns true if at least 'frac' fraction of the subsegment lies - in the overlap_segs.""" - start_time = subsegment[0] - end_time = subsegment[1] - dur = end_time - start_time - total_ovl = 0 - - for seg in overlap_segs: - cur_start, cur_end = seg - if (cur_start >= end_time): - break - ovl_start = max(start_time, cur_start) - ovl_end = min(end_time, cur_end) - ovl_time = max(0, ovl_end-ovl_start) - - total_ovl += ovl_time - - return (total_ovl >= frac * dur) - - -def get_overlap_vector(overlap_rttm, segments): - reco_id = '_'.join(segments[0].split('_')[:3]) - overlap_segs = [] - with open(overlap_rttm, 'r') as f: - for line in f.readlines(): - parts = line.strip().split() - if (parts[1] == reco_id): - overlap_segs.append((float(parts[3]), float(parts[3]) + float(parts[4]))) - ol_vec = np.zeros(len(segments)) - overlap_segs.sort(key=lambda x: x[0]) - for i, segment in enumerate(segments): - parts = re.split('_|-',segment) - start_time = (float(parts[3]) + float(parts[5]))/100 - end_time = (float(parts[3]) + float(parts[6]))/100 - - is_overlap = get_overlap_decision(overlap_segs, (start_time, end_time)) - if is_overlap: - ol_vec[i] = 1 - print ("{}: {} fraction of segments are overlapping".format(id, ol_vec.sum()/len(ol_vec))) - return ol_vec - def read_args(args): segments, labels = read_labels_file(args.input_label_file) xvec_all = dict(kaldi_io.read_vec_flt_ark(args.xvector_ark_file)) @@ -121,17 +71,12 @@ def read_args(args): for segment in segments: xvectors.append(xvec_all[segment]) _, _, plda_psi = kaldi_io.read_plda(args.plda) - if (args.overlap_rttm is not None): - print('Getting overlap segments...') - overlaps = get_overlap_vector(args.overlap_rttm, segments) - else: - overlaps = None - return xvectors, segments, labels, plda_psi, overlaps + return xvectors, segments, labels, plda_psi ################################################################### -def vb_hmm(segments, in_labels, xvectors, overlaps, plda_psi, init_smoothing, loop_prob, fa, fb): +def vb_hmm(segments, in_labels, xvectors, plda_psi, init_smoothing, loop_prob, fa, fb): x = np.array(xvectors) dim = x.shape[1] @@ -153,25 +98,15 @@ def vb_hmm(segments, in_labels, xvectors, overlaps, plda_psi, init_smoothing, lo gamma=q_init, maxSpeakers=q_init.shape[1], maxIters=40, epsilon=1e-6, loopProb=loop_prob, Fa=fa, Fb=fb) - labels = np.argsort(q, axis=1)[:,[-1,-2]] + labels = np.unique(q.argmax(1), return_inverse=True)[1] - if overlaps is not None: - final_labels = [] - for i in range(len(overlaps)): - if (overlaps[i] == 1): - final_labels.append((labels[i,0], labels[i,1])) - else: - final_labels.append(labels[i,0]) - else: - final_labels = labels[:,0] - - return {seg:label for seg,label in zip(segments,final_labels)} + 
return {seg:label for seg,label in zip(segments,labels)} def main(): args = get_args() - xvectors, segments, labels, plda_psi, overlaps = read_args(args) + xvectors, segments, labels, plda_psi = read_args(args) - seg2label_vb = vb_hmm(segments, labels, xvectors, overlaps, plda_psi, args.init_smoothing, + seg2label_vb = vb_hmm(segments, labels, xvectors, plda_psi, args.init_smoothing, args.loop_prob, args.fa, args.fb) write_labels_file(seg2label_vb, args.output_label_file) diff --git a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh index 5badd747d5d..081219ff2a4 100755 --- a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh +++ b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh @@ -13,7 +13,6 @@ stage=0 nj=10 cleanup=true rttm_channel=0 -overlap_rttm= # Path to an RTTM output of an external overlap detector # The hyperparameters used here are taken from the DIHARD # optimal hyperparameter values reported in: @@ -69,14 +68,6 @@ if [ "$result" == "0" ]; then python3 -m pip install numexpr fi -overlap_rttm_opt= -if ! [ -z "$overlap_rttm" ]; then - overlap_rttm_opt="--overlap-rttm $overlap_rttm" - rttm_bin="make_rttm_ol.py" -else - rttm_bin="make_rttm.py" -fi - if [ $stage -le 0 ]; then # Mean subtraction (If original x-vectors are high-dim, e.g. 512, you should # consider also applying LDA to reduce dimensionality to, say, 200) @@ -85,18 +76,10 @@ if [ $stage -le 0 ]; then fi echo -e "Performing bayesian HMM based x-vector clustering..\n" -# making a shell script for each job -for n in `seq $nj`; do - cat <<-EOF > $dir/tmp/vb_hmm.$n.sh - python3 diarization/vb_hmm_xvector.py $overlap_rttm_opt \ - --loop-prob $loop_prob --fa $fa --fb $fb \ - $xvec_dir/xvector_norm.ark $plda $dir/labels.$n $dir/labels.vb.$n -EOF -done - -chmod a+x $dir/tmp/vb_hmm.*.sh $cmd JOB=1:$nj $dir/log/vb_hmm.JOB.log \ - $dir/tmp/vb_hmm.JOB.sh + diarization/vb_hmm_xvector.py \ + --loop-prob $loop_prob --fa $fa --fb $fb \ + $xvec_dir/xvector_norm.ark $plda $dir/labels.JOB $dir/labels.vb.JOB if [ $stage -le 1 ]; then echo "$0: combining labels" diff --git a/egs/libri_css/README.md b/egs/libri_css/README.md new file mode 100644 index 00000000000..b0901d5865e --- /dev/null +++ b/egs/libri_css/README.md @@ -0,0 +1,63 @@ +### LibriCSS integrated recipe + +This is a Kaldi recipe for the LibriCSS data, providing diarization and +ASR on mixed single-channel and separated audio inputs. + +#### Data +We use the LibriCSS data released with the following paper: +``` +@article{Chen2020ContinuousSS, + title={Continuous Speech Separation: Dataset and Analysis}, + author={Z. Chen and T. Yoshioka and Liang Lu and T. Zhou and Zhong Meng and Yi Luo and J. Wu and J. Li}, + journal={ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, + year={2020} +} +``` +For the official data and code, check out [the official repo](https://github.com/chenzhuo1011/libri_css). + +#### Recipe details +This recipe addresses the problem of speech recognition in a meeting-like +scenario, where multiple overlapping speakers may be present, and the +number of speakers is not known beforehand. + +We provide recipes for 2 scenarios: +1. `s5_mono`: This is a single channel diarization + ASR recipe which takes as the +input a long single-channel recording containing mixed audio. 
It then performs SAD,
+diarization, and ASR on it, and outputs speaker-attributed transcriptions,
+which are then evaluated with cpWER (similar to CHiME-6 Track 2).
+2. `s5_css`: This pipeline uses a speech separation module at the beginning,
+so the input is 2-3 separated audio streams. We assume that the separation is
+window-based, so that the same speaker may be split across different streams in
+different windows, thus making diarization necessary.
+
+#### Pretrained models for diarization and ASR
+For ease of reproduction, we include the training of both modules in the
+recipe. We also provide pretrained models for both the diarization and ASR
+systems.
+
+* SAD: CHiME-6 baseline TDNN-Stats SAD, available [here](http://kaldi-asr.org/models/m12).
+* Speaker diarization: CHiME-6 baseline x-vector + AHC diarizer, trained on VoxCeleb
+with simulated RIRs, available [here](http://kaldi-asr.org/models/m12).
+* ASR: We used the chain model trained on 960h of clean LibriSpeech training data, available
+[here](http://kaldi-asr.org/models/m13). It was then additionally fine-tuned for 1
+epoch on LibriSpeech + simulated RIRs. For rescoring, we trained a TDNN-LSTM
+language model. All of these models are available at this
+[Google Drive link](https://drive.google.com/file/d/13ceXdK6oAUuUyxn7kjQVVqpe8r6Sc7ds/view?usp=sharing).
+
+#### Speech separation
+The speech separation module is not provided. If you want to use the
+`s5_css` recipe, check out [this tutorial](https://desh2608.github.io/pages/jsalt/) for
+instructions on how to plug your component into the pipeline.
+
+If you find this recipe useful for your experiments, consider citing:
+
+```
+@article{Raj2021Integration,
+  title={Integration of speech separation, diarization, and recognition for multi-speaker meetings:
+System description, Comparison, and Analysis},
+  author={D. Raj and P. Denisov and Z. Chen and H. Erdogan and Z. Huang and M. He and S. Watanabe and
+  J. Du and T. Yoshioka and Y. Luo and N. Kanda and J. Li and S. Wisdom and J. Hershey},
+  journal={IEEE Spoken Language Technology Workshop 2021},
+  year={2021}
+}
+``` \ No newline at end of file
diff --git a/egs/libri_css/s5_css/cmd.sh b/egs/libri_css/s5_css/cmd.sh
new file mode 100644
index 00000000000..86514d94d4d
--- /dev/null
+++ b/egs/libri_css/s5_css/cmd.sh
@@ -0,0 +1,14 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
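+#
+# As an illustration only (these are not the checked-in defaults): on a
+# single machine with no grid engine, you could instead set, e.g.,
+#   export train_cmd="run.pl"
+#   export decode_cmd="run.pl"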
+ +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/libri_css/s5_css/conf/mfcc.conf b/egs/libri_css/s5_css/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/libri_css/s5_css/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/libri_css/s5_css/conf/mfcc_hires.conf b/egs/libri_css/s5_css/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/libri_css/s5_css/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/libri_css/s5_css/conf/online_cmvn.conf b/egs/libri_css/s5_css/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/libri_css/s5_css/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/libri_css/s5_css/diarization b/egs/libri_css/s5_css/diarization new file mode 120000 index 00000000000..bad937c1444 --- /dev/null +++ b/egs/libri_css/s5_css/diarization @@ -0,0 +1 @@ +../../callhome_diarization/v1/diarization \ No newline at end of file diff --git a/egs/libri_css/s5_css/local b/egs/libri_css/s5_css/local new file mode 120000 index 00000000000..2757f389a5b --- /dev/null +++ b/egs/libri_css/s5_css/local @@ -0,0 +1 @@ +../s5_mono/local \ No newline at end of file diff --git a/egs/libri_css/s5_css/path.sh b/egs/libri_css/s5_css/path.sh new file mode 100644 index 00000000000..2f4e4e4fb21 --- /dev/null +++ b/egs/libri_css/s5_css/path.sh @@ -0,0 +1,9 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH +export PATH=$PWD/dscore:$PATH +export PYTHONPATH="${PYTHONPATH}:$PWD/dscore" +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/libri_css/s5_css/rnnlm b/egs/libri_css/s5_css/rnnlm new file mode 120000 index 00000000000..72302c5e570 --- /dev/null +++ b/egs/libri_css/s5_css/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm \ No newline at end of file diff --git a/egs/libri_css/s5_css/run.sh b/egs/libri_css/s5_css/run.sh new file mode 100755 index 00000000000..6982983a448 --- /dev/null +++ b/egs/libri_css/s5_css/run.sh @@ -0,0 +1,243 @@ +#!/usr/bin/env bash +# +# LibriCSS pipeline containing speech separation. We don't provide +# any training stages for diarization or ASR here, since they are +# the same as those in s5_mono. As such, this run script is +# actually a decoding script. 
Before running this script, you +# need to have run your separation module (or use the separated +# audio streams we have provided), and the output streams must +# be named in the following naming convention: +# overlap_ratio_10.0_sil0.1_1.0_session7_actual10.1_channel_1.wav +# Here, "channel" denotes the stream number, for example, if your +# method separates the audio into 3 streams, they should be named +# channel_0, channel_1, and channel_2. The wav files can be organized +# in any hierarchy within the directory, but this naming +# convention must be followed. +# +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# Apache 2.0 + +# Begin configuration section. +nj=50 +decode_nj=20 +stage=0 + +nnet3_affix=_cleaned +affix=1d_ft +data_affix= # This can be used to distinguish between different data sources +sad_type=tdnn # Set this to webrtc or tdnn + +# Different stages +sad_stage=0 +diarizer_stage=1 +decode_diarize_stage=0 +score_stage=0 +rnnlm_rescore=true + +# RNNLM rescore options +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true +rnnlm_dir=exp/rnnlm_lstm_1a + +set -e # exit on error + +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + +test_sets="dev${data_affix} eval${data_affix}" + +# Get dev and eval set names from the test_sets +dev_set=$( echo $test_sets | cut -d " " -f1 ) +eval_set=$( echo $test_sets | cut -d " " -f2 ) + +# please change the path accordingly. We need the original LibriCSS +# corpus to get the oracle segments (for evaluation purpose), and +# also the path to the separated wav files +libricss_corpus=/export/fs01/LibriCSS/ + +# Separated wav files +wav_files_dir=/export/c03/zhuc/css/connected_continuous_separation + +########################################################################## +# We first prepare the CSS data in the Kaldi data format. We use session 0 +# for dev and others for eval. Since separation has been performed before- +# hand, each recording will contain multiple streams. We do not make any +# assumptions on the number of streams, so that this recipe is extensible +# to other speech separation methods. However, the following script may +# need to be modified depending on the naming conventions used for the +# wav files. +########################################################################## +if [ $stage -le 0 ]; then + local/data_prep_css.sh --data-affix "$data_affix" \ + $libricss_corpus $wav_files_dir +fi + +####################################################################### +# Perform SAD on the dev/eval data using py-webrtcvad package +####################################################################### + +if [ $stage -le 1 ]; then + for datadir in ${test_sets}; do + test_set=data/${datadir} + if [ $sad_type == "webrtc" ]; then + echo "Applying WebRTC-VAD on ${datadir}" + local/segmentation/apply_webrtcvad.py --mode 2 $test_set | sort > $test_set/segments + else + echo "Applying TDNN-Stats-SAD on ${datadir}" + if [ ! -f ${test_set}/wav.scp ]; then + echo "$0: Not performing SAD on ${test_set}" + exit 0 + fi + + sad_nj=$(wc -l < "$test_set/wav.scp") + nj=$(echo $((decode_nj>sad_nj ? sad_nj : decode_nj))) + # Perform segmentation. 
We use the pretrained SAD available at: + # http://kaldi-asr.org/models/4/0004_tdnn_stats_asr_sad_1a.tar.gz + # Download and extract using tar -xvzf + if [ ! -d exp/segmentation_1a/tdnn_stats_asr_sad_1a ]; then + wget http://kaldi-asr.org/models/4/0004_tdnn_stats_asr_sad_1a.tar.gz + tar -xvzf 0004_tdnn_stats_asr_sad_1a.tar.gz + fi + local/detect_speech_activity.sh --cmd "$decode_cmd" --nj $nj $test_set \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a + + # The pretrained SAD used a different MFCC config. We need to + # copy back our old config files. + cp -r ../s5_mono/conf . + fi + + # Create dummy utt2spk file from obtained segments + awk '{print $1, $2}' ${test_set}/segments > ${test_set}/utt2spk + utils/utt2spk_to_spk2utt.pl ${test_set}/utt2spk > ${test_set}/spk2utt + + # Generate RTTM file from segmentation performed by SAD. This can + # be used to evaluate the performance of the SAD as an intermediate + # step. Note that we remove the "stream" from the segments file reco_id + # here because our ground truth does not have these. This means that + # we will have overlapping segments, but that is allowed in the evaluation. + awk '{$2=$2;sub(/_[0-9]*$/, "", $2); print}' ${test_set}/segments > ${test_set}/segments.score + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ + ${test_set}/utt2spk ${test_set}/segments.score ${test_set}/rttm + rm $test_set/segments.score + + echo "Scoring $datadir.." + # We first generate the reference RTTM from the backed up utt2spk and segments + # files. + ref_rttm=${test_set}/ref_rttm + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py ${test_set}/utt2spk.bak \ + ${test_set}/segments.bak ${test_set}/ref_rttm + + md-eval.pl -r $ref_rttm -s ${test_set}/rttm |\ + awk 'or(/MISSED SPEECH/,/FALARM SPEECH/)' + + done +fi + +####################################################################### +# Feature extraction for the dev and eval data +####################################################################### +if [ $stage -le 2 ]; then + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + mfccdir=mfcc + for x in ${test_sets}; do + utils/fix_data_dir.sh data/$x + nj=$(wc -l < "data/$x/wav.scp") + steps/make_mfcc.sh --nj $decode_nj --cmd "$train_cmd" \ + --mfcc-config conf/mfcc_hires.conf \ + data/$x exp/make_mfcc/$x $mfccdir + done +fi + +####################################################################### +# Perform diarization on the dev/eval data +####################################################################### +if [ $stage -le 3 ]; then + for datadir in ${test_sets}; do + ref_rttm=data/${datadir}/ref_rttm + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py data/${datadir}/utt2spk.bak \ + data/${datadir}/segments.bak $ref_rttm + diar_nj=$(wc -l < "data/$datadir/wav.scp") + + [ ! -d exp/xvector_nnet_1a ] && ./local/download_diarizer.sh + + nj=$(echo $((decode_nj>diar_nj ? diar_nj : decode_nj))) + local/diarize_css.sh --nj $nj --cmd "$train_cmd" --stage $diarizer_stage \ + --ref-rttm $ref_rttm --post-process-rttm true \ + exp/xvector_nnet_1a \ + data/${datadir} \ + exp/${datadir}_diarization + done +fi + +####################################################################### +# Decode diarized output using trained chain model +####################################################################### +if [ $stage -le 4 ]; then + for datadir in ${test_sets}; do + asr_nj=$(wc -l < "data/$datadir/wav.scp") + nj=$(echo $((decode_nj>asr_nj ? 
asr_nj : decode_nj)))
+    local/decode_diarized_css.sh --nj $nj --cmd "$decode_cmd" --stage $decode_diarize_stage \
+      --lm-suffix "_tgsmall" --acwt 1.0 --post-decode-acwt 10.0 \
+      exp/${datadir}_diarization/rttm.post data/$datadir data/lang_test_tgsmall \
+      exp/chain${nnet3_affix}/tdnn_${affix} exp/nnet3${nnet3_affix} \
+      data/${datadir}_diarized || exit 1
+  done
+fi
+
+#######################################################################
+# Score decoded dev/eval sets (only if we are not rescoring)
+#######################################################################
+if [ $stage -le 5 ] && ! $rnnlm_rescore; then
+  # please specify both dev and eval set directories so that the search parameters
+  # (insertion penalty and language model weight) will be tuned using the dev set
+  local/score_reco_diarized.sh --cmd "$train_cmd" --stage $score_stage \
+    --multistream true \
+    --dev_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${dev_set}_diarized_2stage \
+    --dev_datadir ${dev_set}_diarized_hires \
+    --eval_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${eval_set}_diarized_2stage \
+    --eval_datadir ${eval_set}_diarized_hires
+fi
+
+############################################################################
+# RNNLM rescoring
+############################################################################
+if $rnnlm_rescore; then
+  if [ $stage -le 6 ]; then
+    echo "$0: Perform RNNLM lattice-rescoring"
+    pruned=
+    ac_model_dir=exp/chain${nnet3_affix}/tdnn_${affix}
+    if $pruned_rescore; then
+      pruned=_pruned
+    fi
+    for decode_set in $test_sets; do
+      decode_dir=${ac_model_dir}/decode_${decode_set}_diarized_2stage
+      # Lattice rescoring
+      rnnlm/lmrescore$pruned.sh \
+        --cmd "$decode_cmd --mem 8G" \
+        --weight 0.45 --max-ngram-order $ngram_order \
+        data/lang_test_tgsmall $rnnlm_dir \
+        data/${decode_set}_diarized_hires ${decode_dir} \
+        ${ac_model_dir}/decode_${decode_set}_diarized_2stage_rescore
+    done
+  fi
+
+  if [ $stage -le 7 ]; then
+    echo "$0: WERs after rescoring with $rnnlm_dir"
+    local/score_reco_diarized.sh --cmd "$train_cmd" --stage $score_stage \
+      --multistream true \
+      --dev_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${dev_set}_diarized_2stage_rescore \
+      --dev_datadir ${dev_set}_diarized_hires \
+      --eval_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${eval_set}_diarized_2stage_rescore \
+      --eval_datadir ${eval_set}_diarized_hires
+  fi
+fi
+
+exit 0;
+
diff --git a/egs/libri_css/s5_css/sid b/egs/libri_css/s5_css/sid
new file mode 120000
index 00000000000..893a12f30c9
--- /dev/null
+++ b/egs/libri_css/s5_css/sid
@@ -0,0 +1 @@
+../../sre08/v1/sid
\ No newline at end of file
diff --git a/egs/libri_css/s5_css/steps b/egs/libri_css/s5_css/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/libri_css/s5_css/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file
diff --git a/egs/libri_css/s5_css/utils b/egs/libri_css/s5_css/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/libri_css/s5_css/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file
diff --git a/egs/libri_css/s5_mono/cmd.sh b/egs/libri_css/s5_mono/cmd.sh
new file mode 100644
index 00000000000..811adcde474
--- /dev/null
+++ b/egs/libri_css/s5_mono/cmd.sh
@@ -0,0 +1,14 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/libri_css/s5_mono/conf/mfcc.conf b/egs/libri_css/s5_mono/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/libri_css/s5_mono/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/libri_css/s5_mono/conf/mfcc_hires.conf b/egs/libri_css/s5_mono/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/libri_css/s5_mono/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/libri_css/s5_mono/conf/online_cmvn.conf b/egs/libri_css/s5_mono/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/libri_css/s5_mono/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/libri_css/s5_mono/diarization b/egs/libri_css/s5_mono/diarization new file mode 120000 index 00000000000..bad937c1444 --- /dev/null +++ b/egs/libri_css/s5_mono/diarization @@ -0,0 +1 @@ +../../callhome_diarization/v1/diarization \ No newline at end of file diff --git a/egs/libri_css/s5_mono/local/best_wer_matching.py b/egs/libri_css/s5_mono/local/best_wer_matching.py new file mode 100755 index 00000000000..1eda3025652 --- /dev/null +++ b/egs/libri_css/s5_mono/local/best_wer_matching.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# Copyright 2020 Desh Raj +# Apache 2.0. + +import sys, io +import itertools +import numpy as np +from scipy.optimize import linear_sum_assignment +import math + +# This class stores all information about a ref/hyp matching +class WerObject: + # By default, we set the errors to very high values to + # handle the error case. 
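+    # Each input line is expected to carry a Kaldi '%WER' summary for one
+    # ref/hyp pair, e.g. (hypothetical values):
+    #   r1h2 %WER 11.21 [ 123 / 1097, 23 ins, 40 del, 60 sub ]
+    # where the id 'r1h2' encodes ref_id=1 and hyp_id=2; __init__ below picks
+    # the WER, word count, and error counts out of this tokenization.
+    # (Missing/NaN WERs are mapped to a large cost of 1000 when the cost
+    # matrix is built further down.)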
+    id = ''
+    ref_id = ''
+    hyp_id = ''
+    wer = 0
+    num_ins = 0
+    num_del = 0
+    num_sub = 0
+    wc = 0
+
+    def __init__(self, line):
+        self.id, details = line.strip().split(maxsplit=1)
+        tokens = details.split()
+        self.wer = float(tokens[1])
+        self.wc = int(tokens[5][:-1])
+        self.num_ins = int(tokens[6])
+        self.num_del = int(tokens[8])
+        self.num_sub = int(tokens[10])
+        self.ref_id, self.hyp_id = self.id[1:].split('h')
+
+
+infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
+
+# First we read all lines and create a list of WER objects
+wer_objects = []
+for line in infile:
+    if not line or line.isspace():
+        continue
+    wer_object = WerObject(line)
+    wer_objects.append(wer_object)
+
+# Now we create a matrix of costs (WER) which we will use to solve
+# a linear sum assignment problem
+wer_objects.sort(key=lambda x: x.ref_id)
+wer_object_matrix = [list(g) for ref_id, g in itertools.groupby(wer_objects, lambda x: x.ref_id)]
+if len(wer_object_matrix) > len(wer_object_matrix[0]):
+    # More references than hypotheses; take transpose
+    wer_object_matrix = [*zip(*wer_object_matrix)]
+wer_matrix = np.array([[1000 if math.isnan(obj.wer) else obj.wer
+                        for obj in row]
+                       for row in wer_object_matrix])
+
+# Solve the assignment problem and compute WER statistics
+row_ind, col_ind = linear_sum_assignment(wer_matrix)
+total_ins = 0
+total_del = 0
+total_sub = 0
+total_wc = 0
+for row, col in zip(row_ind, col_ind):
+    total_ins += wer_object_matrix[row][col].num_ins
+    total_del += wer_object_matrix[row][col].num_del
+    total_sub += wer_object_matrix[row][col].num_sub
+    total_wc += wer_object_matrix[row][col].wc
+total_error = total_ins + total_del + total_sub
+wer = float(100 * total_error) / total_wc
+
+# Write the final statistics to stdout
+print("%WER {:.2f} [ {} / {}, {} ins, {} del, {} sub ]".format(wer, total_error, total_wc,
+                                                               total_ins, total_del, total_sub))
diff --git a/egs/libri_css/s5_mono/local/chain/run_chain_common.sh b/egs/libri_css/s5_mono/local/chain/run_chain_common.sh
new file mode 100755
index 00000000000..fddda061e19
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/chain/run_chain_common.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+
+# this script has common stages shared across librispeech chain recipes.
+# It generates a new topology in a new lang directory, gets the alignments as
+# lattices, and builds a tree for the new topology
+set -e
+
+stage=11
+
+# input directory names. These options are actually compulsory, and they have
+# been named for convenience
+gmm_dir=
+ali_dir=
+lores_train_data_dir=
+
+num_leaves=6000
+
+# output directory names. They are also compulsory.
+lang=
+lat_dir=
+tree_dir=
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1;
+[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1;
+[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1;
+
+for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 11 ]; then
+  echo "$0: creating lang directory with one state per phone."
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states..
the first one is only repeated + # once, the second one has zero or more repeats.] + if [ -d $lang ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then + echo "$0: $lang already exists, not overwriting it; continuing" + else + echo "$0: $lang already exists and seems to be older than data/lang..." + echo " ... not sure what to do. Exiting." + exit 1; + fi + else + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo + fi +fi + +if [ $stage -le 12 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + nj=$(cat ${ali_dir}/num_jobs) + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + $lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir +fi diff --git a/egs/libri_css/s5_mono/local/chain/run_tdnn.sh b/egs/libri_css/s5_mono/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..e1adaa9346d --- /dev/null +++ b/egs/libri_css/s5_mono/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1d.sh \ No newline at end of file diff --git a/egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d.sh b/egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d.sh new file mode 100755 index 00000000000..8e0a9f415a0 --- /dev/null +++ b/egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d.sh @@ -0,0 +1,171 @@ +#!/usr/bin/env bash +set -e + +# This training script is taken directly from Librispeech tdnn_1d. We +# remove the decode stages since we don't need them for this recipe. + +# configs for 'chain' +stage=0 +decode_nj=50 +train_set=train_960_cleaned +gmm=tri6b_cleaned +nnet3_affix=_cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1d +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# TDNN options +frames_per_eg=150,110,100 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf16 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf17 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+      /export/b{09,10,11,12}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir $train_ivector_dir \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.0 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--frames-overlap-per-eg 0 --constrained false" \
+    --egs.chunk-width $frames_per_eg \
+    --trainer.dropout-schedule $dropout_schedule \
+    --trainer.add-option="--optimization.memory-compression-level=2" \
+    --trainer.num-chunk-per-minibatch 64 \
+    --trainer.frames-per-iter 2500000 \
+    --trainer.num-epochs 4 \
+    --trainer.optimization.num-jobs-initial 3 \
+    --trainer.optimization.num-jobs-final 16 \
+    --trainer.optimization.initial-effective-lrate 0.00015 \
+    --trainer.optimization.final-effective-lrate 0.000015 \
+    --trainer.max-param-change 2.0 \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir $train_data_dir \
+    --tree-dir $tree_dir \
+    --lat-dir $lat_dir \
+    --dir $dir || exit 1;
+
+fi
+
+exit 0;
diff --git a/egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d_ft.sh b/egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d_ft.sh
new file mode 100755
index 00000000000..d965194d98a
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/chain/tuning/run_tdnn_1d_ft.sh
@@ -0,0 +1,241 @@
+#!/usr/bin/env bash
+set -e
+
+# This script fine-tunes a pretrained model on additional data
+# which is reverberant, like LibriCSS. We only fine-tune for
+# 1 epoch.
+
+# configs for 'chain'
+stage=0
+nj=40
+decode_nj=50
+train_set=train_960_cleaned
+gmm=tri6b_cleaned
+nnet3_affix=_cleaned
+
+# Pretrained models for AM and i-vector extractor
+src_model_dir=../s5_css/exp/chain$nnet3_affix/tdnn_1d2_sp
+ivector_extractor=exp/nnet3$nnet3_affix/extractor
+primary_lr_factor=0.1  # The learning-rate factor for transferred layers from the source
+                       # model, e.g. if 0, the parameters transferred from the source model
+                       # are fixed.
+                       # The learning-rate factor for newly added layers is 1.0.
+
+# The rest are configs specific to this script. Most of the parameters
+# are just hardcoded at this level, in the commands below.
+affix=1d2_ft
+tree_affix=reverb
+train_stage=-10
+get_egs_stage=-10
+decode_iter=
+
+# TDNN options
+frames_per_eg=150,110,100
+remove_egs=true
+common_egs_dir=
+xent_regularize=0.1
+dropout_schedule='0,0@0.20,0.5@0.50,0'
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + tdnnf-layer name=tdnnf18 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 input=tdnnf17.batchnorm + ## adding the layers for chain branch + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --existing-model $src_model_dir/final.mdl \ + --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs/ + + # Set the learning-rate-factor to be primary_lr_factor for transferred layers " + # and adding new layers to them. + $train_cmd $dir/log/generate_input_mdl.log \ + nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" $src_model_dir/final.mdl - \| \ + nnet3-init --srand=1 - $dir/configs/final.config $dir/input.raw || exit 1; +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b{09,10,11,12}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --trainer.input-model $dir/input.raw \ + --feat.cmvn-opts "--norm-means=true --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 2500000 \ + --trainer.num-epochs 1 \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.0001 \ + --trainer.optimization.final-effective-lrate 0.00001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --use-gpu=wait \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; + +fi + +exit 0; diff --git a/egs/libri_css/s5_mono/local/convert_rttm_to_utt2spk_and_segments.py b/egs/libri_css/s5_mono/local/convert_rttm_to_utt2spk_and_segments.py new file mode 100755 index 00000000000..247aba67b46 --- /dev/null +++ b/egs/libri_css/s5_mono/local/convert_rttm_to_utt2spk_and_segments.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +# Copyright 2019 Vimal Manohar +# Apache 2.0. 
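+
+# Rough worked example of the conversion (all values hypothetical):
+# given the RTTM line
+#   SPEAKER rec1 1 10.50 2.00 <NA> <NA> spk7 <NA> <NA>
+# and a reco2file_and_channel entry "rec1 rec1 1", this script writes
+#   utt2spk:  spk7_001050_001250 spk7
+#   segments: spk7_001050_001250 rec1   10.50   12.50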
+ +"""This script converts an RTTM with +speaker info into kaldi utt2spk and segments""" + +import argparse + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script converts an RTTM with + speaker info into kaldi utt2spk and segments""") + parser.add_argument("--use-reco-id-as-spkr", type=str, + choices=["true", "false"], default="false", + help="Use the recording ID based on RTTM and " + "reco2file_and_channel as the speaker") + parser.add_argument("--append-reco-id-to-spkr", type=str, + choices=["true", "false"], default="false", + help="Append recording ID to the speaker ID") + + parser.add_argument("rttm_file", type=str, + help="""Input RTTM file. + The format of the RTTM file is + """ + """ """) + parser.add_argument("reco2file_and_channel", type=str, + help="""Input reco2file_and_channel. + The format is .""") + parser.add_argument("utt2spk", type=str, + help="Output utt2spk file") + parser.add_argument("segments", type=str, + help="Output segments file") + + args = parser.parse_args() + + args.use_reco_id_as_spkr = bool(args.use_reco_id_as_spkr == "true") + args.append_reco_id_to_spkr = bool(args.append_reco_id_to_spkr == "true") + + if args.use_reco_id_as_spkr: + if args.append_reco_id_to_spkr: + raise Exception("Appending recording ID to speaker does not make sense when using --use-reco-id-as-spkr=true") + + return args + +def main(): + args = get_args() + + file_and_channel2reco = {} + utt2spk={} + segments={} + for line in open(args.reco2file_and_channel): + parts = line.strip().split() + file_and_channel2reco[(parts[1], parts[2])] = parts[0] + + utt2spk_writer = open(args.utt2spk, 'w') + segments_writer = open(args.segments, 'w') + for line in open(args.rttm_file): + parts = line.strip().split() + if parts[0] != "SPEAKER": + continue + + file_id = parts[1] + channel = parts[2] + + try: + reco = file_and_channel2reco[(file_id, channel)] + except KeyError as e: + raise Exception("Could not find recording with " + "(file_id, channel) " + "= ({0},{1}) in {2}: {3}\n".format( + file_id, channel, + args.reco2file_and_channel, str(e))) + + start_time = float(parts[3]) + end_time = start_time + float(parts[4]) + + if args.use_reco_id_as_spkr: + spkr = reco + else: + if args.append_reco_id_to_spkr: + spkr = parts[7] + "_" + reco + else: + spkr = parts[7] + + st = int(start_time * 100) + end = int(end_time * 100) + utt = "{0}_{1:06d}_{2:06d}".format(spkr, st, end) + utt2spk[utt]=spkr + segments[utt]=(reco, start_time, end_time) + + for uttid_id in sorted(utt2spk): + utt2spk_writer.write("{0} {1}\n".format(uttid_id, utt2spk[uttid_id])) + segments_writer.write("{0} {1} {2:7.2f} {3:7.2f}\n".format( + uttid_id, segments[uttid_id][0], segments[uttid_id][1], segments[uttid_id][2])) + +if __name__ == '__main__': + main() diff --git a/egs/libri_css/s5_mono/local/data_prep_css.sh b/egs/libri_css/s5_mono/local/data_prep_css.sh new file mode 100755 index 00000000000..5029b05e9af --- /dev/null +++ b/egs/libri_css/s5_mono/local/data_prep_css.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +# +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# Apache 2.0 + +# Begin configuration section. +# End configuration section +data_affix= +volume=1 + +. ./utils/parse_options.sh # accept options + +. 
./path.sh + +echo >&2 "$0" "$@" +if [ $# -ne 2 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /export/corpora/LibriCSS /export/c01/zhuc/css_data" + exit 1 +fi + +corpus_dir=$1 +wav_files_dir=$2 + +set -e -o pipefail + +# If data is not already present, then download and unzip +if [ ! -d $corpus_dir/for_release ]; then + echo "Downloading and unpacking LibriCSS data." + CWD=`pwd` + mkdir -p $corpus_dir + + cd $corpus_dir + + # Download the data. If the data has already been downloaded, it + # does nothing. (See wget -c) + wget -c --load-cookies /tmp/cookies.txt \ + "https://docs.google.com/uc?export=download&confirm=$(wget --quiet \ + --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate \ + 'https://docs.google.com/uc?export=download&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l' \ + -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l" \ + -O for_release.zip && rm -rf /tmp/cookies.txt + + # unzip (skip if already extracted) + unzip -n for_release.zip + + # segmentation + cd for_release + python3 segment_libricss.py -data_path . + + cd $CWD +fi + +# Process the downloaded data directory to get data in Kaldi format +# We first copy all the separated wav files from the original location +# without any directory structure. Here, the wav naming convention is +# similar to that in the LibriCSS corpus meeting directories, with an +# additional `channel_n` at the end denoting the stream number, e.g. +# overlap_ratio_10.0_sil0.1_1.0_session0_actual10.1_channel_0.wav +# note that this "channel" is actually one of the separated audio streams. +mkdir -p data/local/data${data_affix}/wavs_orig +find $wav_files_dir -name '*.wav' -exec cp {} data/local/data${data_affix}/wavs_orig \; +local/prepare_data_css.py --srcpath $corpus_dir/for_release --wav-path data/local/data${data_affix}/wavs_orig \ + --tgtpath data/local/data${data_affix} --volume $volume + +# Create dev and eval splits based on sessions. In total we have 10 sessions (session0 to +# session9) of approximately 1 hour each. +dev_sessions="session0" +eval_sessions="session[1-9]" + +mkdir -p data/dev${data_affix} +for file in wav.scp utt2spk text segments; do + grep $dev_sessions data/local/data${data_affix}/$file | sort > data/dev${data_affix}/$file +done + +mkdir -p data/eval${data_affix} +for file in wav.scp utt2spk text segments; do + grep $eval_sessions data/local/data${data_affix}/$file | sort > data/eval${data_affix}/$file +done + +# Move the utt2spk, segments, and text file to .bak so that they are only used +# in the last scoring stage. We also prepare a dummy utt2spk and spk2utt for +# these. 
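+#
+# For example (hypothetical recording id), a wav.scp entry
+#   overlap_ratio_10.0_sil0.1_1.0_session0_actual10.1_channel_0 /path/to/that.wav
+# yields the dummy utt2spk line
+#   overlap_ratio_10.0_sil0.1_1.0_session0_actual10.1_channel_0 overlap_ratio_10.0_sil0.1_1.0_session0_actual10.1_channel_0
+# i.e. each recording acts as its own "speaker" until diarization assigns real ones.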
+for datadir in dev eval; do + for file in text utt2spk segments; do + mv data/$datadir${data_affix}/$file data/$datadir${data_affix}/$file.bak + done + + awk '{print $1, $1}' data/$datadir${data_affix}/wav.scp > data/$datadir${data_affix}/utt2spk + utils/utt2spk_to_spk2utt.pl data/$datadir${data_affix}/utt2spk > data/$datadir${data_affix}/spk2utt + +done diff --git a/egs/libri_css/s5_mono/local/data_prep_librispeech.sh b/egs/libri_css/s5_mono/local/data_prep_librispeech.sh new file mode 100755 index 00000000000..cbdd147b2df --- /dev/null +++ b/egs/libri_css/s5_mono/local/data_prep_librispeech.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash + +# Copyright 2014 Vassil Panayotov +# 2014 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" + exit 1 +fi + +src=$1 +dst=$2 + +spk_file=$src/../SPEAKERS.TXT + +mkdir -p $dst || exit 1; + +[ ! -d $src ] && echo "$0: no such directory $src" && exit 1; +[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1; + + +wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp +trans=$dst/text; [[ -f "$trans" ]] && rm $trans +utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk +spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender + +for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do + reader=$(basename $reader_dir) + if ! [ $reader -eq $reader ]; then # not integer. + echo "$0: unexpected subdirectory name $reader" + exit 1; + fi + + reader_gender=$(egrep "^$reader[ ]+\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}') + if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then + echo "Unexpected gender: '$reader_gender'" + exit 1; + fi + + for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do + chapter=$(basename $chapter_dir) + if ! [ "$chapter" -eq "$chapter" ]; then + echo "$0: unexpected chapter-subdirectory name $chapter" + exit 1; + fi + + find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ + awk -v "dir=$chapter_dir" '{printf "%s sox %s/%s.flac -t wav - |\n", $0, dir, $0}' >>$wav_scp|| exit 1 + + chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt + [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 + cat $chapter_trans >>$trans + + # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered + # to be a different speaker. This is done for simplicity and because we want + # e.g. the CMVN to be calculated per-chapter + awk -v "reader=$reader" -v "chapter=$chapter" '{printf "%s %s-%s\n", $1, reader, chapter}' \ + <$chapter_trans >>$utt2spk || exit 1 + + # reader -> gender map (again using per-chapter granularity) + echo "${reader}-${chapter} $reader_gender" >>$spk2gender + done +done + +spk2utt=$dst/spk2utt +utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 + +ntrans=$(wc -l <$trans) +nutt2spk=$(wc -l <$utt2spk) +! 
[ "$ntrans" -eq "$nutt2spk" ] && \ + echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1; + +utils/validate_data_dir.sh --no-feats $dst || exit 1; + +echo "$0: successfully prepared data in $dst" + +exit 0 diff --git a/egs/libri_css/s5_mono/local/data_prep_mono.sh b/egs/libri_css/s5_mono/local/data_prep_mono.sh new file mode 100755 index 00000000000..75f661f79bb --- /dev/null +++ b/egs/libri_css/s5_mono/local/data_prep_mono.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +# +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# Apache 2.0 + +# Begin configuration section. +# End configuration section +data_affix= + +. ./utils/parse_options.sh # accept options + +. ./path.sh + +echo >&2 "$0" "$@" +if [ $# -ne 2 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /export/corpora/LibriCSS /export/corpora/LibriSpeech" + exit 1 +fi + +corpus_dir=$1 +librispeech_dir=$2 + +set -e -o pipefail + +# If data is not already present, then download and unzip +if [ ! -d $corpus_dir/for_release ]; then + echo "Downloading and unpacking LibriCSS data." + CWD=`pwd` + mkdir -p $corpus_dir + + cd $corpus_dir + + # Download the data. If the data has already been downloaded, it + # does nothing. (See wget -c) + wget -c --load-cookies /tmp/cookies.txt \ + "https://docs.google.com/uc?export=download&confirm=$(wget --quiet \ + --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate \ + 'https://docs.google.com/uc?export=download&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l' \ + -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l" \ + -O for_release.zip && rm -rf /tmp/cookies.txt + + # unzip (skip if already extracted) + unzip -n for_release.zip + + # segmentation + cd for_release + python3 segment_libricss.py -data_path . + + cd $CWD +fi + +# Process the downloaded data directory to get data in Kaldi format +if ! [ -d data/local/data${data_affix} ]; then + mkdir -p data/local/data${data_affix}/ + local/prepare_data.py --srcpath $corpus_dir/for_release --tgtpath data/local/data${data_affix} --mics 0 \ + --cleanpath $librispeech_dir +fi + +# Create dev and eval splits based on sessions. In total we have 10 sessions (session0 to +# session9) of approximately 1 hour each. In the below strings, separate each session by +# '\|' to perform grep at once. +dev_sessions="session0" +eval_sessions="session[1-9]" + +mkdir -p data/dev${data_affix} +for file in wav.scp utt2spk text segments wav_clean.scp; do + grep $dev_sessions data/local/data${data_affix}/$file | sort > data/dev${data_affix}/$file +done + +mkdir -p data/eval${data_affix} +for file in wav.scp utt2spk text segments wav_clean.scp; do + grep $eval_sessions data/local/data${data_affix}/$file | sort > data/eval${data_affix}/$file +done + +# Move the utt2spk, segments, and text file to .bak so that they are only used +# in the last scoring stage. We also prepare a dummy utt2spk and spk2utt for +# these. 
+for datadir in dev eval; do
+  for file in text utt2spk segments; do
+    mv data/$datadir${data_affix}/$file data/$datadir${data_affix}/$file.bak
+  done
+
+  awk '{print $1, $1}' data/$datadir${data_affix}/wav.scp > data/$datadir${data_affix}/utt2spk
+  utils/utt2spk_to_spk2utt.pl data/$datadir${data_affix}/utt2spk > data/$datadir${data_affix}/spk2utt
+
+done
diff --git a/egs/libri_css/s5_mono/local/decode.sh b/egs/libri_css/s5_mono/local/decode.sh
new file mode 100755
index 00000000000..620d0319927
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/decode.sh
@@ -0,0 +1,227 @@
+#!/usr/bin/env bash
+#
+# This script decodes raw utterances through the entire pipeline:
+# VAD -> Feature extraction -> Diarization -> ASR
+#
+# Copyright  2017  Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
+#            2019  Desh Raj, David Snyder, Ashish Arora, Zhaoheng Ni
+# Apache 2.0
+
+# Begin configuration section.
+nj=8
+stage=0
+
+diarizer_stage=1
+decode_diarize_stage=0
+decode_oracle_stage=0
+score_stage=0
+nnet3_affix=_cleaned   # affix for the chain directory name
+affix=1d_ft   # affix for the TDNN directory name
+
+# If the following is set to true, we use the oracle speaker and segment
+# information instead of performing SAD and diarization.
+use_oracle_segments=false
+sad_type=webrtc   # Set this to webrtc or tdnn
+rnnlm_rescore=true
+
+# RNNLM rescore options
+ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order
+              # if it's set, it merges histories in the lattice if they share
+              # the same ngram history and this prevents the lattice from
+              # exploding exponentially
+pruned_rescore=true
+rnnlm_dir=exp/rnnlm_lstm_1a
+
+test_sets="dev eval"
+
+. ./utils/parse_options.sh
+
+. ./cmd.sh
+. ./path.sh
+
+# Get dev and eval set names from the test_sets
+dev_set=$( echo $test_sets | cut -d " " -f1 )
+eval_set=$( echo $test_sets | cut -d " " -f2 )
+
+$use_oracle_segments && [ $stage -le 8 ] && stage=8
+
+#######################################################################
+# Perform SAD on the dev/eval data using the py-webrtcvad package
+#######################################################################
+if [ $stage -le 1 ]; then
+  for datadir in ${test_sets}; do
+    test_set=data/${datadir}
+    if [ $sad_type == "webrtc" ]; then
+      echo "Applying WebRTC-VAD on ${datadir}"
+      local/segmentation/apply_webrtcvad.py --mode 0 $test_set | sort > $test_set/segments
+    else
+      echo "Applying TDNN-Stats-SAD on ${datadir}"
+      if [ ! -f ${test_set}/wav.scp ]; then
+        echo "$0: Not performing SAD on ${test_set}, since wav.scp does not exist. Exiting!"
+        exit 0
+      fi
+
+      sad_nj=$(wc -l < "$test_set/wav.scp")
+      sad_nj=$((nj < sad_nj ? nj : sad_nj))   # at most one job per recording
+      # Perform segmentation. We use the pretrained CHiME-6 SAD available at:
+      # http://kaldi-asr.org/models/12/0012_sad_v1.tar.gz
+      # Download and extract using tar -xvzf
+      if [ ! 
-d exp/segmentation_1a/tdnn_stats_sad_1a ]; then + wget http://kaldi-asr.org/models/12/0012_sad_v1.tar.gz || exit + tar -xvzf 0012_sad_v1.tar.gz + cp -r 0012_sad_v1/conf/* conf/ + cp -r 0012_sad_v1/exp/segmentation_1a exp/ + fi + local/detect_speech_activity.sh --cmd "$decode_cmd" --nj $sad_nj $test_set \ + exp/segmentation_1a/tdnn_stats_sad_1a + fi + + # Create dummy utt2spk file from obtained segments + awk '{print $1, $2}' ${test_set}/segments > ${test_set}/utt2spk + utils/utt2spk_to_spk2utt.pl ${test_set}/utt2spk > ${test_set}/spk2utt + + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ + ${test_set}/utt2spk ${test_set}/segments ${test_set}/rttm + + echo "Scoring $datadir.." + # We first generate the reference RTTM from the backed up utt2spk and segments + # files. + ref_rttm=${test_set}/ref_rttm + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py ${test_set}/utt2spk.bak \ + ${test_set}/segments.bak ${test_set}/ref_rttm + + md-eval.pl -r $ref_rttm -s ${test_set}/rttm |\ + awk '/(MISSED|FALARM) SPEECH/' + + done +fi + +####################################################################### +# Feature extraction for the dev and eval data +####################################################################### +if [ $stage -le 2 ]; then + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + mfccdir=mfcc + for x in ${test_sets}; do + steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \ + --mfcc-config conf/mfcc_hires.conf \ + data/$x exp/make_mfcc/$x $mfccdir + done +fi + +####################################################################### +# Perform diarization on the dev/eval data +####################################################################### +if [ $stage -le 3 ]; then + for datadir in ${test_sets}; do + ref_rttm=data/${datadir}/ref_rttm + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py data/${datadir}/utt2spk.bak \ + data/${datadir}/segments.bak $ref_rttm + diar_nj=$(wc -l < "data/$datadir/wav.scp") # This is important especially for VB-HMM + + [ ! 
-d exp/xvector_nnet_1a ] && ./local/download_diarizer.sh
+
+    local/diarize_spectral.sh --nj $diar_nj --cmd "$train_cmd" --stage $diarizer_stage \
+      --ref-rttm $ref_rttm \
+      exp/xvector_nnet_1a \
+      data/${datadir} \
+      exp/${datadir}_diarization
+  done
+fi
+
+#######################################################################
+# Decode diarized output using trained chain model
+#######################################################################
+if [ $stage -le 4 ]; then
+  for datadir in ${test_sets}; do
+    asr_nj=$(wc -l < "data/$datadir/wav.scp")
+    local/decode_diarized.sh --nj $asr_nj --cmd "$decode_cmd" --stage $decode_diarize_stage \
+      --lm-suffix "_tgsmall" \
+      exp/${datadir}_diarization/rttm data/$datadir data/lang_test_tgsmall \
+      exp/chain${nnet3_affix}/tdnn_${affix} exp/nnet3${nnet3_affix} \
+      data/${datadir}_diarized || exit 1
+  done
+fi
+
+#######################################################################
+# Score decoded dev/eval sets
+#######################################################################
+if [ $stage -le 5 ]; then
+  # please specify both dev and eval set directories so that the search parameters
+  # (insertion penalty and language model weight) will be tuned using the dev set
+  local/score_reco_diarized.sh --cmd "$train_cmd" --stage $score_stage \
+    --dev_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${dev_set}_diarized_2stage \
+    --dev_datadir ${dev_set}_diarized_hires \
+    --eval_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${eval_set}_diarized_2stage \
+    --eval_datadir ${eval_set}_diarized_hires
+fi
+
+############################################################################
+# RNNLM rescoring
+############################################################################
+if $rnnlm_rescore; then
+  if [ $stage -le 6 ]; then
+    echo "$0: Perform RNNLM lattice-rescoring"
+    pruned=
+    ac_model_dir=exp/chain${nnet3_affix}/tdnn_${affix}
+    if $pruned_rescore; then
+      pruned=_pruned
+    fi
+    for decode_set in $test_sets; do
+      decode_dir=${ac_model_dir}/decode_${decode_set}_diarized_2stage
+      # Lattice rescoring
+      rnnlm/lmrescore${pruned}.sh \
+        --cmd "$decode_cmd --mem 8G" \
+        --weight 0.45 --max-ngram-order $ngram_order \
+        data/lang_test_tgsmall $rnnlm_dir \
+        data/${decode_set}_diarized_hires ${decode_dir} \
+        ${ac_model_dir}/decode_${decode_set}_diarized_2stage_rescore
+    done
+  fi
+
+  if [ $stage -le 7 ]; then
+    echo "$0: WERs after rescoring with $rnnlm_dir"
+    local/score_reco_diarized.sh --cmd "$train_cmd" --stage $score_stage \
+      --dev_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${dev_set}_diarized_2stage_rescore \
+      --dev_datadir ${dev_set}_diarized_hires \
+      --eval_decodedir exp/chain${nnet3_affix}/tdnn_${affix}/decode_${eval_set}_diarized_2stage_rescore \
+      --eval_datadir ${eval_set}_diarized_hires
+  fi
+fi
+
+$use_oracle_segments || exit 0
+
+######################################################################
+# Here we decode using oracle speaker and segment information
+######################################################################
+if [ $stage -le 8 ]; then
+  # mfccdir should be some place with a largish disk where you
+  # want to store MFCC features.
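+  # e.g. you could point this to a scratch filesystem (hypothetical path):
+  # mfccdir=/export/scratch/$USER/libri_css/mfcc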
+  mfccdir=mfcc
+  for x in ${test_sets}; do
+    datadir=data/${x}_oracle
+    mkdir -p $datadir
+
+    cp data/$x/wav.scp $datadir/
+    cp data/$x/segments.bak $datadir/segments
+    cp data/$x/utt2spk.bak $datadir/utt2spk
+    cp data/$x/text.bak $datadir/text
+    utils/utt2spk_to_spk2utt.pl $datadir/utt2spk > $datadir/spk2utt
+
+    steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \
+      --mfcc-config conf/mfcc_hires.conf \
+      $datadir exp/make_mfcc/$x $mfccdir
+  done
+fi
+
+if [ $stage -le 9 ]; then
+  local/decode_oracle.sh --stage $decode_oracle_stage \
+    --affix $affix \
+    --lang-dir data/lang_test_tgsmall \
+    --lm-suffix "_tgsmall" \
+    --rnnlm-rescore $rnnlm_rescore \
+    --test_sets "$test_sets"
+fi
+
+exit 0;
diff --git a/egs/libri_css/s5_mono/local/decode_diarized.sh b/egs/libri_css/s5_mono/local/decode_diarized.sh
new file mode 100755
index 00000000000..a8a858bc1ed
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/decode_diarized.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+# Copyright 2019 Ashish Arora, Vimal Manohar
+# Apache 2.0.
+# This script takes an rttm file and performs decoding on a test directory.
+# The output directory contains a text file which can be used for scoring.
+
+stage=0
+nj=8
+cmd=run.pl
+lm_suffix=
+
+echo "$0 $@"  # Print the command line for logging
+
+. ./path.sh
+. utils/parse_options.sh || exit 1;
+
+if [ $# != 6 ]; then
+  echo "Usage: $0 <rttm-file> <in-data-dir> <lang-dir> <model-dir> <ivector-extractor-dir> <out-dir>"
+  echo "e.g.: $0 data/rttm data/dev data/lang_chain exp/chain/tdnn_1a \
+   exp/nnet3_cleaned data/dev_diarized"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs."
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+rttm=$1
+data_in=$2
+lang_dir=$3
+asr_model_dir=$4
+ivector_extractor=$5
+out_dir=$6
+
+# Note: the decoding graph is built in stage 3 below, so we do not require
+# graph${lm_suffix}/HCLG.fst to exist up front.
+for f in $rttm $data_in/wav.scp $data_in/text.bak \
+     $lang_dir/L.fst $asr_model_dir/final.mdl; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+if [ $stage -le 0 ]; then
+  echo "$0: copying data files to output directory"
+  mkdir -p ${out_dir}_hires
+  cp ${data_in}/{wav.scp,utt2spk,utt2spk.bak} ${out_dir}_hires
+  utils/data/get_reco2dur.sh ${out_dir}_hires
+fi
+
+if [ $stage -le 1 ]; then
+  echo "$0: creating segments file and utt2spk from rttm"
+  local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm \
+    <(awk '{print $2" "$2" "$3}' $rttm | sort -u) \
+    ${out_dir}_hires/utt2spk ${out_dir}_hires/segments
+
+  utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt
+  utils/fix_data_dir.sh ${out_dir}_hires || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: extracting MFCC features using the segments file"
+  steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$cmd" ${out_dir}_hires
+  steps/compute_cmvn_stats.sh ${out_dir}_hires
+  utils/fix_data_dir.sh ${out_dir}_hires || exit 1;
+  cp $data_in/text.bak ${out_dir}_hires/text
+fi
+
+if [ $stage -le 3 ]; then
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 --remove-oov $lang_dir \
+    $asr_model_dir $asr_model_dir/graph${lm_suffix}
+fi
+
+if [ $stage -le 4 ]; then
+  echo "$0: performing decoding on the extracted features"
+  local/nnet3/decode.sh --affix 2stage --acwt 1.0 --post-decode-acwt 10.0 \
+    --frames-per-chunk 150 --nj $nj --ivector-dir $ivector_extractor \
+    $out_dir $lang_dir $asr_model_dir/graph${lm_suffix} $asr_model_dir/
+fi
+
diff --git a/egs/libri_css/s5_mono/local/decode_diarized_css.sh b/egs/libri_css/s5_mono/local/decode_diarized_css.sh
new file mode 100755
index 00000000000..995901a935d
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/decode_diarized_css.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# Copyright 2019 Ashish Arora, Vimal Manohar, Desh Raj
+# Apache 2.0.
+# This script is similar to the decode_diarized.sh script, except that it
+# works on CSS separated audio streams. The key difference here is in how
+# we create segments for feature extraction, since now they have to
+# come from the respective streams.
+
+stage=0
+nj=8
+cmd=run.pl
+lm_suffix=
+acwt=1.0
+post_decode_acwt=10.0
+
+echo "$0 $@"  # Print the command line for logging
+
+. ./path.sh
+. utils/parse_options.sh || exit 1;
+
+if [ $# != 6 ]; then
+  echo "Usage: $0 <rttm-file> <in-data-dir> <lang-dir> <model-dir> <ivector-extractor-dir> <out-dir>"
+  echo "e.g.: $0 data/rttm data/dev data/lang_chain exp/chain/tdnn_1a \
+   exp/nnet3_cleaned data/dev_diarized"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs."
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+rttm=$1
+data_in=$2
+lang_dir=$3
+asr_model_dir=$4
+ivector_extractor=$5
+out_dir=$6
+
+for f in $rttm $data_in/wav.scp $data_in/text.bak \
+     $lang_dir/L.fst $asr_model_dir/final.mdl; do
+  [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 0 ]; then + echo "$0 copying data files in output directory" + mkdir -p ${out_dir}_hires + cp ${data_in}/{wav.scp,utt2spk.bak} ${out_dir}_hires + utils/data/get_reco2dur.sh ${out_dir}_hires +fi + +if [ $stage -le 1 ]; then + echo "$0 creating segments file from rttm and utt2spk, reco2file_and_channel " + local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm \ + <(awk '{print $2" "$2" "$3}' $rttm | sort -u) \ + ${out_dir}_hires/utt2spk.reco ${out_dir}_hires/segments + + # We remove the stream id from the spk id (for speaker-level CMN) + awk '{$2=$2;sub(/_[0-9]*$/, "", $2); print}' ${out_dir}_hires/utt2spk.reco \ + > ${out_dir}_hires/utt2spk + + utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt + utils/fix_data_dir.sh ${out_dir}_hires +fi + +if [ $stage -le 2 ]; then + # Now we extract features + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$cmd" ${out_dir}_hires + steps/compute_cmvn_stats.sh ${out_dir}_hires + utils/fix_data_dir.sh ${out_dir}_hires || exit 1; + cp $data_in/text.bak ${out_dir}_hires/text +fi + +if [ $stage -le 3 ]; then + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_dir \ + $asr_model_dir $asr_model_dir/graph${lm_suffix} +fi + +if [ $stage -le 4 ]; then + echo "$0 performing decoding on the extracted features" + local/nnet3/decode.sh --affix 2stage --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --frames-per-chunk 150 --nj $nj --ivector-dir $ivector_extractor \ + ${out_dir} $lang_dir $asr_model_dir/graph${lm_suffix} $asr_model_dir/ +fi + diff --git a/egs/libri_css/s5_mono/local/decode_oracle.sh b/egs/libri_css/s5_mono/local/decode_oracle.sh new file mode 100755 index 00000000000..6e82142f927 --- /dev/null +++ b/egs/libri_css/s5_mono/local/decode_oracle.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# +# This script performs recognition with oracle speaker and segment information + +# Begin configuration section. +decode_nj=20 +stage=0 +test_sets= +lang_dir= +lm_suffix= +nnet3_affix=_cleaned # affix for the chain directory name +affix=1d # affix for the TDNN directory name +rnnlm_rescore=false + +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + +# RNNLM rescore options +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true +rnnlm_dir=exp/rnnlm_lstm_1a + +dir=exp/chain${nnet3_affix}/tdnn_${affix} + +# Get dev and eval set names from the test_sets +dev_set=$( echo $test_sets | cut -d " " -f1 ) +eval_set=$( echo $test_sets | cut -d " " -f2 ) + + +set -e # exit on error + +########################################################################## +# DECODING: we perform 2 stage decoding. +########################################################################## + +if [ $stage -le 0 ]; then + # First the options that are passed through to run_ivector_common.sh + # (some of which are also used in this script directly). + + # The rest are configs specific to this script. Most of the parameters + # are just hardcoded at this level, in the commands below. + echo "$0: decode data..." 
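+  # Note: local/nnet3/decode.sh below decodes in two passes (hence the "2stage"
+  # affix): a first pass whose lattices are turned into per-frame VAD weights
+  # (see local/extract_vad_weights.sh), which are then used when extracting
+  # i-vectors for the final pass.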
+
+  # training options
+  # training chunk-options
+  chunk_width=150,110,100
+  # we don't need extra left/right context for TDNN systems.
+  chunk_left_context=0
+  chunk_right_context=0
+
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 $lang_dir \
+    $dir $dir/graph${lm_suffix} || exit 1;
+
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      local/nnet3/decode.sh --affix 2stage --pass2-decode-opts "--min-active 1000" \
+        --acwt 1.0 --post-decode-acwt 10.0 \
+        --frames-per-chunk 150 --nj $decode_nj \
+        --ivector-dir exp/nnet3${nnet3_affix} \
+        data/${data}_oracle $lang_dir \
+        $dir/graph${lm_suffix} \
+        exp/chain${nnet3_affix}/tdnn_${affix}
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+
+##########################################################################
+# Scoring: here we obtain WER per condition and overall WER
+##########################################################################
+
+if [ $stage -le 1 ]; then
+  # please specify both dev and eval set directories so that the search parameters
+  # (insertion penalty and language model weight) will be tuned using the dev set
+  local/score_reco_oracle.sh \
+    --dev exp/chain${nnet3_affix}/tdnn_${affix}/decode_${dev_set}_oracle_2stage \
+    --eval exp/chain${nnet3_affix}/tdnn_${affix}/decode_${eval_set}_oracle_2stage
+fi
+
+############################################################################
+# RNNLM rescoring
+############################################################################
+if $rnnlm_rescore; then
+  if [ $stage -le 2 ]; then
+    echo "$0: Perform RNNLM lattice-rescoring"
+    pruned=
+    ac_model_dir=exp/chain${nnet3_affix}/tdnn_${affix}
+    if $pruned_rescore; then
+      pruned=_pruned
+    fi
+    for decode_set in $test_sets; do
+      decode_dir=${ac_model_dir}/decode_${decode_set}_oracle_2stage
+      # Lattice rescoring
+      rnnlm/lmrescore$pruned.sh \
+        --cmd "$decode_cmd --mem 8G" \
+        --weight 0.45 --max-ngram-order $ngram_order \
+        $lang_dir $rnnlm_dir \
+        data/${decode_set}_oracle_hires ${decode_dir} \
+        ${ac_model_dir}/decode_${decode_set}_oracle_2stage_rescore
+    done
+  fi
+  if [ $stage -le 3 ]; then
+    echo "$0: WERs after rescoring with $rnnlm_dir"
+    local/score_reco_oracle.sh \
+      --dev exp/chain${nnet3_affix}/tdnn_${affix}/decode_${dev_set}_oracle_2stage_rescore \
+      --eval exp/chain${nnet3_affix}/tdnn_${affix}/decode_${eval_set}_oracle_2stage_rescore
+  fi
+fi
diff --git a/egs/libri_css/s5_mono/local/detect_speech_activity.sh b/egs/libri_css/s5_mono/local/detect_speech_activity.sh
new file mode 100755
index 00000000000..1b9b062fb8b
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/detect_speech_activity.sh
@@ -0,0 +1,225 @@
+#!/usr/bin/env bash
+
+# Copyright 2016-17  Vimal Manohar
+#              2017  Nagendra Kumar Goel
+# Apache 2.0.
+
+# This script does nnet3-based speech activity detection given an input
+# kaldi data directory and outputs a segmented kaldi data directory.
+
+set -e
+set -o pipefail
+set -u
+
+if [ -f ./path.sh ]; then . 
./path.sh; fi + +affix= # Affix for the segmentation +nj=32 +cmd=queue.pl +stage=-1 + +# Feature options (Must match training) +mfcc_config=conf/mfcc_hires.conf +feat_affix= # Affix for the type of feature used + +output_name=output # The output node in the network +sad_name=sad # Base name for the directory storing the computed loglikes + # Can be music for music detection +segmentation_name=segmentation # Base name for the directory doing segmentation + # Can be segmentation_music for music detection + +# SAD network config +iter=final # Model iteration to use + +# Contexts must ideally match training for LSTM models, but +# may not necessarily for stats components +extra_left_context=0 # Set to some large value, typically 40 for LSTM (must match training) +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +frames_per_chunk=150 + +# Decoding options +graph_opts="--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0" +acwt=1.0 + +# These _in__weight represent the fraction of probability +# to transfer to class. +# e.g. --speech-in-sil-weight=0.0 --garbage-in-sil-weight=0.0 --sil-in-speech-weight=0.0 --garbage-in-speech-weight=0.3 +transform_probs_opts="" + +# Postprocessing options +segment_padding=0.2 # Duration (in seconds) of padding added to segments +min_segment_dur=0 # Minimum duration (in seconds) required for a segment to be included + # This is before any padding. Segments shorter than this duration will be removed. + # This is an alternative to --min-speech-duration above. +merge_consecutive_max_dur=0 # Merge consecutive segments as long as the merged segment is no longer than this many + # seconds. The segments are only merged if their boundaries are touching. + # This is after padding by --segment-padding seconds. + # 0 means do not merge. Use 'inf' to not limit the duration. +cleanup=false # If true, remove files created during feature extraction + +echo $* + +. utils/parse_options.sh + +if [ $# -ne 2 ]; then + echo "This script does nnet3-based speech activity detection given an input kaldi " + echo "data directory and outputs an output kaldi data directory." + echo "See script for details of the options to be supplied." + echo "Usage: $0 " + echo " e.g.: $0 ~/workspace/egs/ami/s5b/data/sdm1/dev exp/nnet3_sad_snr/nnet_tdnn_j_n4 \\" + echo " mfcc_hires exp/segmentation_sad_snr/nnet_tdnn_j_n4" + echo "" + echo "Options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # number of parallel jobs to run." + echo " --stage # stage to do partial re-run from." + echo " --convert-data-dir-to-whole # If true, the input data directory is " + echo " # first converted to whole data directory (i.e. whole recordings) " + echo " # and segmentation is done on that." + echo " # If false, then the original segments are " + echo " # retained and they are split into sub-segments." + echo " --output-name # The output node in the network" + echo " --extra-left-context # Set to some large value, typically 40 for LSTM (must match training)" + echo " --extra-right-context # For BLSTM or statistics pooling" + echo " --cleanup # Remove files created during feature extraction" + exit 1 +fi + +src_data_dir=$1 # The input data directory that needs to be segmented. + # If convert_data_dir_to_whole is true, any segments in that will be ignored. 
+sad_nnet_dir=$2   # The SAD neural network
+
+affix=${affix:+_$affix}
+feat_affix=${feat_affix:+_$feat_affix}
+
+dir=exp/segmentation${affix}
+
+data_id=`basename $src_data_dir`
+sad_dir=${dir}/${sad_name}${affix}_${data_id}${feat_affix}
+seg_dir=${dir}/${segmentation_name}${affix}_${data_id}${feat_affix}
+
+###############################################################################
+## Forward pass through the network and dump the log-likelihoods.
+###############################################################################
+
+frame_subsampling_factor=1
+if [ -f $sad_nnet_dir/frame_subsampling_factor ]; then
+  frame_subsampling_factor=$(cat $sad_nnet_dir/frame_subsampling_factor)
+fi
+
+if [ $stage -le 1 ]; then
+  # mfccdir should be some place with a largish disk where you
+  # want to store MFCC features.
+  mfccdir=mfcc
+  steps/make_mfcc.sh --nj $nj --cmd "$cmd" \
+    --mfcc-config conf/mfcc_hires.conf \
+    $src_data_dir exp/make_mfcc/$data_id $mfccdir
+fi
+
+mkdir -p $dir
+if [ $stage -le 2 ]; then
+  if [ "$(readlink -f $sad_nnet_dir)" != "$(readlink -f $dir)" ]; then
+    cp $sad_nnet_dir/cmvn_opts $dir || exit 1
+  fi
+
+  ########################################################################
+  ## Initialize neural network for decoding using the output $output_name
+  ########################################################################
+
+  if [ ! -z "$output_name" ] && [ "$output_name" != output ]; then
+    $cmd $dir/log/get_nnet_${output_name}.log \
+      nnet3-copy --edits="rename-node old-name=$output_name new-name=output" \
+      $sad_nnet_dir/$iter.raw $dir/${iter}_${output_name}.raw || exit 1
+    iter=${iter}_${output_name}
+  else
+    if ! diff $sad_nnet_dir/$iter.raw $dir/$iter.raw; then
+      cp $sad_nnet_dir/$iter.raw $dir/
+    fi
+  fi
+
+  steps/nnet3/compute_output.sh --nj $nj --cmd "$cmd" \
+    --iter ${iter} \
+    --extra-left-context $extra_left_context \
+    --extra-right-context $extra_right_context \
+    --extra-left-context-initial $extra_left_context_initial \
+    --extra-right-context-final $extra_right_context_final \
+    --frames-per-chunk $frames_per_chunk --apply-exp true \
+    --frame-subsampling-factor $frame_subsampling_factor \
+    ${src_data_dir} $dir $sad_dir || exit 1
+fi
+
+###############################################################################
+## Prepare the FST we search to make speech/silence decisions.
+###############################################################################
+
+utils/data/get_utt2dur.sh --nj $nj --cmd "$cmd" $src_data_dir || exit 1
+frame_shift=$(utils/data/get_frame_shift.sh $src_data_dir) || exit 1
+
+graph_dir=${dir}/graph_${output_name}
+if [ $stage -le 3 ]; then
+  mkdir -p $graph_dir
+
+  # 1 for silence and 2 for speech
+  cat <<EOF > $graph_dir/words.txt
+<eps> 0
+silence 1
+speech 2
+EOF
+
+  $cmd $graph_dir/log/make_graph.log \
+    steps/segmentation/internal/prepare_sad_graph.py $graph_opts \
+      --frame-shift=$(perl -e "print $frame_shift * $frame_subsampling_factor") - \| \
+    fstcompile --isymbols=$graph_dir/words.txt --osymbols=$graph_dir/words.txt '>' \
+      $graph_dir/HCLG.fst
+fi
+
+###############################################################################
+## Do Viterbi decoding to create per-frame alignments.
+###############################################################################
+
+post_vec=$sad_nnet_dir/post_${output_name}.vec
+if [ ! -f $sad_nnet_dir/post_${output_name}.vec ]; then
+  if [ ! -f $sad_nnet_dir/post_${output_name}.txt ]; then
+    echo "$0: Could not find $sad_nnet_dir/post_${output_name}.vec."
" + echo "Re-run the corresponding stage in the training script possibly " + echo "with --compute-average-posteriors=true or compute the priors " + echo "from the training labels" + exit 1 + else + post_vec=$sad_nnet_dir/post_${output_name}.txt + fi +fi + +mkdir -p $seg_dir +if [ $stage -le 4 ]; then + steps/segmentation/internal/get_transform_probs_mat.py \ + --priors="$post_vec" $transform_probs_opts > $seg_dir/transform_probs.mat + + steps/segmentation/decode_sad.sh --acwt $acwt --cmd "$cmd" \ + --nj $nj \ + --transform "$seg_dir/transform_probs.mat" \ + $graph_dir $sad_dir $seg_dir +fi + +############################################################################### +## Post-process segmentation to create kaldi data directory. +############################################################################### + +if [ $stage -le 5 ]; then + steps/segmentation/post_process_sad_to_segments.sh \ + --segment-padding $segment_padding --min-segment-dur $min_segment_dur \ + --merge-consecutive-max-dur $merge_consecutive_max_dur \ + --cmd "$cmd" --frame-shift $(perl -e "print $frame_subsampling_factor * $frame_shift") \ + ${src_data_dir} ${seg_dir} ${seg_dir} +fi + +sed 's:-:_:g' ${seg_dir}/segments > $src_data_dir/segments # to be consistent for scoring + +if [ $cleanup ]; then + rm $src_data_dir/{feats.scp,frame_shift,utt2dur,utt2num_frames} 2> /dev/null +fi + +echo "$0: Created output segments in ${src_data_dir}" +exit 0 \ No newline at end of file diff --git a/egs/libri_css/s5_mono/local/diarization/post_process_css_rttm.py b/egs/libri_css/s5_mono/local/diarization/post_process_css_rttm.py new file mode 100755 index 00000000000..b90ad3d97b9 --- /dev/null +++ b/egs/libri_css/s5_mono/local/diarization/post_process_css_rttm.py @@ -0,0 +1,121 @@ +#! /usr/bin/env python3 +# Copyright 2020 Desh Raj +# Apache 2.0. +"""This script takes an RTTM file and removes same-speaker segments +which may be present at the same time across streams. This is meant +to be used as a post-processing step after performing clustering-based +diarization on top of separated streams of audio. The idea is to +eliminate false alarms caused by leakage, since the separation +method may not be perfect.""" + +import argparse, os +import itertools +from collections import defaultdict + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script takes an RTTM file and removes same-speaker segments + which may be present at the same time across streams. This is meant + to be used as a post-processing step after performing clustering-based + diarization on top of separated streams of audio. 
+        The idea is to eliminate false alarms caused by leakage, since the
+        separation method may not be perfect.""",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("input_rttm", type=str,
+                        help="path of input rttm file")
+    parser.add_argument("output_rttm", type=str,
+                        help="path of output rttm file")
+    args = parser.parse_args()
+    return args
+
+class Segment:
+    def __init__(self, parts):
+        self.reco_id = '_'.join(parts[1].split('_')[:-1])
+        self.stream = int(parts[1].split('_')[-1])
+        self.start_time = float(parts[3])
+        self.duration = float(parts[4])
+        self.end_time = self.start_time + self.duration
+        self.label = int(parts[7])
+
+
+def main():
+    args = get_args()
+
+    # First we read all segments and store them as a list of objects
+    segments = []
+    with open(args.input_rttm, 'r') as f:
+        for line in f.readlines():
+            parts = line.strip().split()
+            segments.append(Segment(parts))
+
+    groupfn = lambda x: (x.reco_id, x.label)
+    # itertools.groupby requires the list to be sorted by the grouping key
+    segments.sort(key=groupfn)
+    # We group the segment list into a dictionary indexed by (reco_id, spk_id)
+    reco_and_spk_to_segs = defaultdict(list,
+        {uid : list(g) for uid, g in itertools.groupby(segments, groupfn)})
+
+    reco_and_spk_to_final_segs = {}
+    for uid in reco_and_spk_to_segs.keys():
+        reco_id, spk_id = uid
+        segs = reco_and_spk_to_segs[uid]
+        tokens = []
+        for seg in segs:
+            tokens.append(('BEG', seg.start_time, seg.stream))
+            tokens.append(('END', seg.end_time, seg.stream))
+        tokens.sort(key=lambda x: x[1])
+
+        # Remove segments which lie completely inside another segment
+        running_segs = {}
+        new_segs = []   # (start_time, end_time, stream)
+        for token in tokens:
+            if token[0] == 'BEG':
+                running_segs[token[2]] = token[1]
+            else:
+                seg_start = running_segs[token[2]]
+                seg_end = token[1]
+                seg_stream = token[2]
+                new_seg = (seg_start, seg_end, seg_stream)
+                del running_segs[token[2]]
+
+                # if this segment was the only running segment, then append
+                if len(running_segs) == 0:
+                    new_segs.append(new_seg)
+                    continue
+
+                # if any running segment started before this one, it means this
+                # segment is totally enclosed within the other, so we don't add it
+                if not any(i < new_seg[0] for i in running_segs.values()):
+                    new_segs.append(new_seg)
+
+        new_segs.sort(key=lambda x: x[0])
+        num_segs = len(new_segs)
+        # Now we have partially overlapping segments. We divide the overlapping
+        # portion equally between the two segments.
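+        # Illustration (hypothetical times): segments (0.0, 5.0) on stream 1
+        # and (4.0, 8.0) on stream 2 overlap on [4.0, 5.0]; the split point is
+        # the midpoint avg = (4.0 + 5.0) / 2 = 4.5, giving (0.0, 4.5) on
+        # stream 1 and (4.5, 8.0) on stream 2.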
+        final_segs = []   # (start_time, end_time, stream)
+        for i in range(num_segs):
+            seg = new_segs[i]
+            # If it is the last segment in the recording, or the last contiguous
+            # segment, add it as is
+            if (i == num_segs - 1 or seg[1] <= new_segs[i+1][0]):
+                final_segs.append(seg)
+            # Otherwise split the overlapping interval between the current and
+            # the next segment at its midpoint
+            else:
+                avg = (new_segs[i+1][0] + seg[1]) / 2
+                final_segs.append((seg[0], avg, seg[2]))
+                if not (avg < new_segs[i+1][1]):
+                    # Pathological case: the midpoint lies beyond the end of the
+                    # next segment; print the offending segments for inspection
+                    print(reco_id, spk_id, seg, new_segs[i+1])
+                new_segs[i+1] = (avg, new_segs[i+1][1], new_segs[i+1][2])
+                # Re-sort the remaining segments, since the modified start time
+                # may change their order. (Note: sorting a slice would sort a
+                # copy, so we assign the sorted slice back in place.)
+                new_segs[i+1:] = sorted(new_segs[i+1:], key=lambda x: x[0])
+        reco_and_spk_to_final_segs[(reco_id, spk_id)] = final_segs
+
+    rttm_str = "SPEAKER {0} 1 {1:7.3f} {2:7.3f} <NA> <NA> {3} <NA> <NA>\n"
+    with open(args.output_rttm, 'w') as f:
+        for (reco_id, spk_id) in sorted(reco_and_spk_to_final_segs):
+            segs = reco_and_spk_to_final_segs[(reco_id, spk_id)]
+            for seg in segs:
+                utt_id = "{}_{}".format(reco_id, seg[2])
+                dur = seg[1] - seg[0]
+                if dur > 0.025:
+                    f.write(rttm_str.format(utt_id, seg[0], dur, spk_id))
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/libri_css/s5_mono/local/diarization/scluster.sh b/egs/libri_css/s5_mono/local/diarization/scluster.sh
new file mode 100755
index 00000000000..374ec192031
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/diarization/scluster.sh
@@ -0,0 +1,100 @@
+#!/bin/bash
+
+# Copyright      2016  David Snyder
+#           2017-2018  Matthew Maciejewski
+#                2020  Maxim Korenevsky (STC-innovations Ltd)
+# Apache 2.0.
+
+# This script performs spectral clustering using scored
+# pairs of subsegments and produces an RTTM file with speaker
+# labels derived from the clusters.
+
+# Begin configuration section.
+cmd="run.pl"
+stage=0
+nj=10
+cleanup=true
+rttm_channel=0
+reco2num_spk=
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 2 ]; then
+  echo "Usage: $0 <src-dir> <output-dir>"
+  echo " e.g.: $0 exp/ivectors_callhome exp/ivectors_callhome/results"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --nj <n|10>                                      # Number of jobs (also see num-processes and num-threads)"
+  echo "  --stage <stage|0>                                # To control partial reruns"
+  echo "  --rttm-channel <rttm-channel|0>                  # The value passed into the RTTM channel field. Only affects"
+  echo "                                                   # the format of the RTTM file."
+  echo "  --reco2num-spk <reco2num-spk-file>               # File containing mapping of recording ID"
+  echo "                                                   # to number of speakers. Used instead of threshold"
+  echo "                                                   # as stopping criterion if supplied."
+  echo "  --cleanup <bool|true>                            # If true, remove temporary files"
+  exit 1;
+fi
+
+srcdir=$1/cossim_scores
+xvec_dir=$1
+dir=$2
+
+mkdir -p $dir/tmp
+
+for f in $srcdir/scores.scp $srcdir/spk2utt $srcdir/utt2spk $xvec_dir/segments.bak ; do
+  [ ! -f $f ] && echo "No such file $f" && exit 1;
+done
+
+cp $srcdir/spk2utt $dir/tmp/
+cp $srcdir/utt2spk $dir/tmp/
+cp $srcdir/segments $dir/tmp/
+utils/fix_data_dir.sh $dir/tmp > /dev/null
+
+if [ ! -z "$reco2num_spk" ]; then
+  reco2num_spk="ark,t:$reco2num_spk"
+fi
+
+sdata=$dir/tmp/split$nj;
+utils/split_data.sh $dir/tmp $nj || exit 1;
+
+# Set various variables.
+mkdir -p $dir/log
+
+feats="utils/filter_scp.pl $sdata/JOB/spk2utt $srcdir/scores.scp |"
+
+reco2num_spk_opt=
+if [ -n "$reco2num_spk" ]; then
+  reco2num_spk_opt="--reco2num-spk $reco2num_spk"
+fi
+
+if [ $stage -le 0 ]; then
+  echo "$0: clustering scores"
+  for j in `seq $nj`; do
+    utils/filter_scp.pl $sdata/$j/spk2utt $srcdir/scores.scp > $dir/scores.$j.scp
+  done
+  $cmd JOB=1:$nj $dir/log/spectral_cluster.JOB.log \
+    diarization/spec_clust.py $reco2num_spk_opt \
+      scp:$dir/scores.JOB.scp ark,t:$sdata/JOB/spk2utt ark,t:$dir/labels.JOB || exit 1;
+fi
+
+if [ $stage -le 1 ]; then
+  echo "$0: combining labels"
+  for j in $(seq $nj); do cat $dir/labels.$j; done > $dir/labels || exit 1;
+fi
+
+# Note that here we use the segments.bak file, which contains the mapping from
+# subsegments to the original stream. This ensures that segments do not cross
+# streams (since we will perform ASR on them later).
+if [ $stage -le 2 ]; then
+  echo "$0: computing RTTM"
+  diarization/make_rttm.py --rttm-channel $rttm_channel $xvec_dir/segments.bak $dir/labels $dir/rttm || exit 1;
+fi
+
+if $cleanup ; then
+  rm -r $dir/tmp || exit 1;
+fi
\ No newline at end of file
diff --git a/egs/libri_css/s5_mono/local/diarize.sh b/egs/libri_css/s5_mono/local/diarize.sh
new file mode 100755
index 00000000000..83e6fe72267
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/diarize.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+# Copyright  2019  David Snyder
+#            2020  Desh Raj
+
+# Apache 2.0.
+#
+# This script takes an input directory that has a segments file (and
+# a feats.scp file), and performs diarization on it. The output directory
+# contains an RTTM file which can be used to resegment the input data.
+
+stage=0
+nj=10
+cmd="run.pl"
+ref_rttm=
+score_overlaps_only=true
+
+echo "$0 $@"  # Print the command line for logging
+
+set -e
+
+. ./path.sh
+. parse_options.sh
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <model-dir> <in-data-dir> <out-dir>"
+  echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs."
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --ref_rttm ./local/dev_rttm                      # the location of the reference RTTM file"
+  exit 1;
+fi
+
+model_dir=$1
+data_in=$2
+out_dir=$3
+
+name=$(basename "$data_in")
+
+for f in $data_in/feats.scp $data_in/segments $model_dir/plda \
+  $model_dir/final.raw $model_dir/extract.config; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+if [ $stage -le 1 ]; then
+  echo "$0: computing features for x-vector extractor"
+  utils/fix_data_dir.sh data/${name}
+  rm -rf data/${name}_cmn
+  local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \
+    data/$name data/${name}_cmn exp/${name}_cmn
+  cp data/$name/segments exp/${name}_cmn/
+  utils/fix_data_dir.sh data/${name}_cmn
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: extracting x-vectors for all segments"
+  diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \
+    --nj $nj --window 1.5 --period 0.75 --apply-cmn false \
+    --min-segment 0.5 $model_dir \
+    data/${name}_cmn $out_dir/xvectors_${name}
+fi
+
+# Perform PLDA scoring
+if [ $stage -le 3 ]; then
+  # Perform PLDA scoring on all pairs of segments for each recording.
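+  # For a recording with N subsegment x-vectors, this produces an N x N matrix
+  # of pairwise PLDA similarity scores (written under plda_scores below), which
+  # the clustering in the next stage consumes.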
+ echo "$0: performing PLDA scoring between all pairs of x-vectors" + diarization/nnet3/xvector/score_plda.sh --cmd "$cmd" \ + --target-energy 0.5 \ + --nj $nj $model_dir/ $out_dir/xvectors_${name} \ + $out_dir/xvectors_${name}/plda_scores +fi + +if [ $stage -le 4 ]; then + echo "$0: performing clustering using PLDA scores (threshold tuned on dev)" + diarization/cluster.sh --cmd "$cmd" --nj $nj \ + --rttm-channel 1 --threshold 0.4 \ + $out_dir/xvectors_${name}/plda_scores $out_dir + echo "$0: wrote RTTM to output directory ${out_dir}" +fi + +hyp_rttm=${out_dir}/rttm + +if [ $stage -le 5 ]; then + echo "Diarization results for "${name} + local/dscore.sh --score-overlaps-only $score_overlaps_only \ + $ref_rttm $hyp_rttm +fi diff --git a/egs/libri_css/s5_mono/local/diarize_css.sh b/egs/libri_css/s5_mono/local/diarize_css.sh new file mode 100755 index 00000000000..4808f763a59 --- /dev/null +++ b/egs/libri_css/s5_mono/local/diarize_css.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# Copyright 2019 David Snyder +# 2020 Desh Raj + +# Apache 2.0. +# +# This script is exactly the same as local/diarize.sh until +# stage 2 (x-vector extraction), but after that, it is slightly +# different. The key difference is that since we have multiple +# streams of audio (and subsequently multiple streams of subsegments) +# from the same recording, we want to perform cosine scoring across +# all of these streams. + +stage=0 +nj=10 +cmd="run.pl" +ref_rttm= +window=1.5 +period=0.75 +min_segment=0.5 +post_process_rttm=false # set to true to remove same speaker segments in different + # streams at the same time +score_overlaps_only=true + +echo "$0 $@" # Print the command line for logging + +set -e + +. ./path.sh +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization" + echo "Options: " + echo " --nj # number of parallel jobs." + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --ref_rttm ./local/dev_rttm # the location of the reference RTTM file" + exit 1; +fi + +model_dir=$1 +data_in=$2 +out_dir=$3 + +name=$(basename "$data_in") + +for f in $data_in/feats.scp $data_in/segments \ + $model_dir/final.raw $model_dir/extract.config; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 1 ]; then + echo "$0: computing features for x-vector extractor" + utils/fix_data_dir.sh data/${name} + rm -rf data/${name}_cmn + local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \ + data/$name data/${name}_cmn exp/${name}_cmn + cp data/$name/segments exp/${name}_cmn/ + utils/fix_data_dir.sh data/${name}_cmn +fi + +if [ $stage -le 2 ]; then + echo "$0: extracting x-vectors for all segments" + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \ + --nj $nj --window $window --period $period --apply-cmn false \ + --min-segment $min_segment $model_dir \ + data/${name}_cmn $out_dir/xvectors_${name} +fi + +# Perform cosine scoring. The following stage is the key difference. +# We change the segments and utt2spk files in the xvector directory +# to reflect that the subsegments are from the same recording. +# But we also keep the original segments file since that will +# be required in subsequent stages for ASR decoding. +if [ $stage -le 3 ]; then + # The if condition is just to ensure that we don't accidentally + # make this modification more than once (which would mess up the + # segments file) + if [ ! 
-f ${out_dir}/xvectors_${name}/segments.bak ]; then + mv ${out_dir}/xvectors_${name}/segments ${out_dir}/xvectors_${name}/segments.bak + mv ${out_dir}/xvectors_${name}/utt2spk ${out_dir}/xvectors_${name}/utt2spk.bak + awk '{$2=$2;sub(/_[0-9]*$/, "", $2); print}' ${out_dir}/xvectors_${name}/segments.bak \ + > ${out_dir}/xvectors_${name}/segments + awk '{$2=$2;sub(/_[0-9]*$/, "", $2); print}' ${out_dir}/xvectors_${name}/utt2spk.bak \ + > ${out_dir}/xvectors_${name}/utt2spk + utils/utt2spk_to_spk2utt.pl ${out_dir}/xvectors_${name}/utt2spk > ${out_dir}/xvectors_${name}/spk2utt + fi +fi + +# nj needs to be changed since we now have #wav/#streams number +# of recordings. Just get it from the segments file +new_nj=$(cat ${out_dir}/xvectors_${name}/segments | cut -d' ' -f2 | uniq | wc -l) +nj=$(echo $((nj>new_nj ? new_nj : nj))) + +if [ $stage -le 4 ]; then + # Perform cosine similarity scoring on all pairs of segments for each recording. + echo "$0: performing cosine similarity scoring between all pairs of x-vectors" + diarization/score_cossim.sh --cmd "$cmd" \ + --nj $nj $out_dir/xvectors_${name} \ + $out_dir/xvectors_${name}/cossim_scores +fi + +if [ $stage -le 5 ]; then + echo "$0: performing spectral clustering using cosine similarity scores" + local/diarization/scluster.sh --cmd "$cmd" --nj $nj \ + --rttm-channel 1 \ + $out_dir/xvectors_${name} $out_dir + echo "$0: wrote RTTM to output directory ${out_dir}" + + # The above clustering generates RTTM with reco separated into streams, + # so we have to remove the stream name for evaluation. + awk '{$2=$2;sub(/_[0-9]*$/, "", $2); print}' ${out_dir}/rttm \ + > ${out_dir}/rttm.comb +fi + +if [ $stage -le 6 ] && [ $post_process_rttm == "true" ]; then + echo "$0: applying post-processing to remove simultaneous same-speaker segments" + local/diarization/post_process_css_rttm.py ${out_dir}/rttm ${out_dir}/rttm.post + + awk '{$2=$2;sub(/_[0-9]*$/, "", $2); print}' ${out_dir}/rttm.post \ + > ${out_dir}/rttm.comb +fi + +hyp_rttm=${out_dir}/rttm.comb + +if [ $stage -le 7 ]; then + echo "Diarization results for "${name} + local/dscore.sh --score-overlaps-only $score_overlaps_only \ + $ref_rttm $hyp_rttm +fi diff --git a/egs/libri_css/s5_mono/local/diarize_spectral.sh b/egs/libri_css/s5_mono/local/diarize_spectral.sh new file mode 100755 index 00000000000..0b12e5a57ea --- /dev/null +++ b/egs/libri_css/s5_mono/local/diarize_spectral.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# Copyright 2019 David Snyder +# 2020 Desh Raj + +# Apache 2.0. +# +# This is similar to local/diarize.sh but uses spectral clustering instead +# of AHC. + +stage=0 +nj=10 +cmd="run.pl" +ref_rttm= +score_overlaps_only=true + +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization" + echo "Options: " + echo " --nj # number of parallel jobs." + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --ref_rttm ./local/dev_rttm # the location of the reference RTTM file" + exit 1; +fi + +model_dir=$1 +data_in=$2 +out_dir=$3 + +name=$(basename "$data_in") + +for f in $data_in/feats.scp $data_in/segments \ + $model_dir/final.raw $model_dir/extract.config; do + [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+if [ $stage -le 1 ]; then
+  echo "$0: computing features for x-vector extractor"
+  utils/fix_data_dir.sh data/${name}
+  rm -rf data/${name}_cmn
+  local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \
+    data/$name data/${name}_cmn exp/${name}_cmn
+  cp data/$name/segments exp/${name}_cmn/
+  utils/fix_data_dir.sh data/${name}_cmn
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: extracting x-vectors for all segments"
+  diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \
+    --nj $nj --window 1.5 --period 0.75 --apply-cmn false \
+    --min-segment 0.5 $model_dir \
+    data/${name}_cmn $out_dir/xvectors_${name}
+fi
+
+# Perform cosine similarity scoring
+if [ $stage -le 3 ]; then
+  # Perform cosine similarity scoring on all pairs of segments for each recording.
+  echo "$0: performing cosine similarity scoring between all pairs of x-vectors"
+  diarization/score_cossim.sh --cmd "$cmd" \
+    --nj $nj $out_dir/xvectors_${name} \
+    $out_dir/xvectors_${name}/cossim_scores
+fi
+
+if [ $stage -le 4 ]; then
+  echo "$0: performing spectral clustering using cosine similarity scores"
+  diarization/scluster.sh --cmd "$cmd" --nj $nj \
+    --rttm-channel 1 \
+    $out_dir/xvectors_${name}/cossim_scores $out_dir
+  echo "$0: wrote RTTM to output directory ${out_dir}"
+fi
+
+hyp_rttm=${out_dir}/rttm
+
+if [ $stage -le 5 ]; then
+  echo "Diarization results for ${name}"
+  local/dscore.sh --score-overlaps-only $score_overlaps_only \
+    $ref_rttm $hyp_rttm
+fi
diff --git a/egs/libri_css/s5_mono/local/download_and_untar.sh b/egs/libri_css/s5_mono/local/download_and_untar.sh
new file mode 100755
index 00000000000..5cf6adde8bc
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/download_and_untar.sh
@@ -0,0 +1,100 @@
+#!/usr/bin/env bash
+
+# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: dev-clean, test-clean, dev-other, test-other,"
+  echo "          train-clean-100, train-clean-360, train-other-500."
+  exit 1
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+part_ok=false
+list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f $data/LibriSpeech/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+
+# Sizes of the archive files in bytes. These are for some older versions of
+# the corpus.
+sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128"
+# sizes_new contains the archive file sizes of the final release. Some of these
+# sizes are for files we probably won't download.
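+# A partially downloaded archive will not match any of the sizes above or
+# below, so the check that follows deletes it and the download is restarted.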
+sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606"
+
+if [ -f $data/$part.tar.gz ]; then
+  size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tar.gz
+  else
+    echo "$data/$part.tar.gz exists and appears to be complete."
+  fi
+fi
+
+pushd $data
+
+if [ ! -f $part.tar.gz ]; then
+  if ! which wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tar.gz
+  echo "$0: downloading data from $full_url. This may take some time, please be patient."
+
+  if ! wget --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+if ! tar -xvzf $part.tar.gz; then
+  echo "$0: error un-tarring archive $data/$part.tar.gz"
+  exit 1;
+fi
+
+popd >&/dev/null
+
+touch $data/LibriSpeech/$part/.complete
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
+  rm $data/$part.tar.gz
+fi
diff --git a/egs/libri_css/s5_mono/local/download_diarizer.sh b/egs/libri_css/s5_mono/local/download_diarizer.sh
new file mode 100755
index 00000000000..a0ef096e10d
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/download_diarizer.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+#
+# Copyright 2020 University of Stuttgart (Author: Pavel Denisov)
+# Apache 2.0
+
+# Begin configuration section.
+# End configuration section
+. ./utils/parse_options.sh  # accept options
+
+. ./path.sh
+
+echo >&2 "$0" "$@"
+if [ $# -ne 0 ] ; then
+  echo >&2 "$0" "$@"
+  echo >&2 "$0: Error: this script takes no arguments"
+  echo -e >&2 "Usage:\n  $0"
+  exit 1
+fi
+
+
+set -e -o pipefail
+
+mkdir -p downloads
+dir=$(mktemp -d ./downloads/lcss.XXXXXXXXX)
+trap "rm -rf ${dir}" EXIT
+
+cd ${dir}
+
+# Download x-vector extractor trained on VoxCeleb2 data
+wget http://kaldi-asr.org/models/12/0012_diarization_v1.tar.gz
+tar -xvzf 0012_diarization_v1.tar.gz
+rm -f 0012_diarization_v1.tar.gz
+
+# Download PLDA model trained on augmented LibriSpeech data
+rm 0012_diarization_v1/exp/xvector_nnet_1a/plda
+wget https://desh2608.github.io/static/files/jsalt/plda -P 0012_diarization_v1/exp/xvector_nnet_1a/
+cd ../..
+cp -r ${dir}/0012_diarization_v1/exp .
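+# The copied exp/xvector_nnet_1a directory now contains final.raw,
+# extract.config and plda -- the files that the local/diarize*.sh scripts
+# check for before diarization.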
diff --git a/egs/libri_css/s5_mono/local/download_lm.sh b/egs/libri_css/s5_mono/local/download_lm.sh
new file mode 100755
index 00000000000..129ca1edbe3
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/download_lm.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+
+# Copyright 2014 Vassil Panayotov
+# Apache 2.0
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <base-url> <dst-dir>"
+  echo "e.g.: $0 http://www.openslr.org/resources/11 data/local/lm"
+  exit 1
+fi
+
+base_url=$1
+dst_dir=$2
+
+# given a filename returns the corresponding file size in bytes
+# The switch cases below can be autogenerated by entering the data directory and running:
+# for f in *; do echo "\"$f\") echo \"$(du -b $f | awk '{print $1}')\";;"; done
+function filesize() {
+  case $1 in
+    "3-gram.arpa.gz") echo "759636181";;
+    "3-gram.pruned.1e-7.arpa.gz") echo "34094057";;
+    "3-gram.pruned.3e-7.arpa.gz") echo "13654242";;
+    "4-gram.arpa.gz") echo "1355172078";;
+    "g2p-model-5") echo "20098243";;
+    "librispeech-lexicon.txt") echo "5627653";;
+    "librispeech-lm-corpus.tgz") echo "1803499244";;
+    "librispeech-lm-norm.txt.gz") echo "1507274412";;
+    "librispeech-vocab.txt") echo "1737588";;
+    *) echo "";;
+  esac
+}
+
+function check_and_download () {
+  [[ $# -eq 1 ]] || { echo "check_and_download() expects exactly one argument!"; return 1; }
+  fname=$1
+  echo "Downloading file '$fname' into '$dst_dir'..."
+  expect_size="$(filesize $fname)"
+  [[ ! -z "$expect_size" ]] || { echo "Unknown file size for '$fname'"; return 1; }
+  if [[ -s $dst_dir/$fname ]]; then
+    # In the following statement, the 'du' version works on Linux, and the
+    # 'stat' fallback after '||' works on macOS/BSD.
+    f=$dst_dir/$fname
+    fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f)
+    if [[ "$fsize" -eq "$expect_size" ]]; then
+      echo "'$fname' already exists and appears to be complete"
+      return 0
+    else
+      echo "WARNING: '$fname' exists, but the size is wrong - re-downloading ..."
+    fi
+  fi
+  wget --no-check-certificate -O $dst_dir/$fname $base_url/$fname || {
+    echo "Error while trying to download $fname!"
+    return 1
+  }
+  f=$dst_dir/$fname
+  # As above: 'du' works on Linux, the 'stat' fallback on macOS/BSD.
+  fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f)
+  [[ "$fsize" -eq "$expect_size" ]] || { echo "$fname: file size mismatch!"; return 1; }
+  return 0
+}
+
+mkdir -p $dst_dir
+
+for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz 4-gram.arpa.gz \
+         g2p-model-5 librispeech-lm-corpus.tgz librispeech-vocab.txt librispeech-lexicon.txt; do
+  check_and_download $f || exit 1
+done
+
+cd $dst_dir
+ln -sf 3-gram.pruned.1e-7.arpa.gz lm_tgmed.arpa.gz
+ln -sf 3-gram.pruned.3e-7.arpa.gz lm_tgsmall.arpa.gz
+ln -sf 3-gram.arpa.gz lm_tglarge.arpa.gz
+ln -sf 4-gram.arpa.gz lm_fglarge.arpa.gz
+
+exit 0
diff --git a/egs/libri_css/s5_mono/local/dscore.sh b/egs/libri_css/s5_mono/local/dscore.sh
new file mode 100644
index 00000000000..43665aba4a2
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/dscore.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+# Copyright   2020   Desh Raj
+
+# Apache 2.0.
+#
+# This script installs a fork of the dscore toolkit
+# (https://github.com/nryant/dscore), which also supports
+# evaluating the overlapping regions only. It then scores
+# the output sys_rttm based on the provided ref_rttm.
+
+score_overlaps_only=true
+
+echo "$0 $@"  # Print the command line for logging
+
+set -e
+
+. ./path.sh
+. 
parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/test/rttm exp/test_diarization/rttm" + exit 1; +fi + +ref_rttm=$1 +hyp_rttm=$2 + +if ! [ -d dscore ]; then + git clone https://github.com/desh2608/dscore.git -b libricss --single-branch + cd dscore + python3 -m pip install --user -r requirements.txt + cd .. +fi + +# Create per condition ref and hyp RTTM files for scoring per condition +mkdir -p tmp +trap "rm -r tmp" EXIT + +conditions="0L 0S OV10 OV20 OV30 OV40" +cp $ref_rttm tmp/ref.all +cp $hyp_rttm tmp/hyp.all +for rttm in ref hyp; do + for cond in $conditions; do + cat tmp/$rttm.all | grep $cond > tmp/$rttm.$cond + done +done + +echo "Scoring all regions..." +for cond in $conditions 'all'; do + echo -n "Condition: $cond: " + ref_rttm_path=$(readlink -f tmp/ref.$cond) + hyp_rttm_path=$(readlink -f tmp/hyp.$cond) + cd dscore + python3 score.py -r $ref_rttm_path -s $hyp_rttm_path --global_only + cd .. +done + +# We also score overlapping regions only +if [ $score_overlaps_only == "true" ]; then + echo "Scoring overlapping regions..." + for cond in $conditions 'all'; do + echo -n "Condition: $cond: " + ref_rttm_path=$(readlink -f tmp/ref.$cond) + hyp_rttm_path=$(readlink -f tmp/hyp.$cond) + cd dscore + python3 score.py -r $ref_rttm_path -s $hyp_rttm_path --overlap_only --global_only + cd .. + done +fi diff --git a/egs/libri_css/s5_mono/local/extract_vad_weights.sh b/egs/libri_css/s5_mono/local/extract_vad_weights.sh new file mode 100755 index 00000000000..d5019f100b1 --- /dev/null +++ b/egs/libri_css/s5_mono/local/extract_vad_weights.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script converts lattices available from a first pass decode into a per-frame weights file +# The ctms generated from the lattices are filtered. Silence frames are assigned a low weight (e.g.0.00001) +# and voiced frames have a weight of 1. + +set -e + +stage=1 +cmd=run.pl +silence_weight=0.00001 +#end configuration section. + +. ./cmd.sh + +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; +if [ $# -ne 4 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + exit 1; +fi + +data_dir=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +decode_dir=$3 +output_wts_file_gz=$4 + +if [ $stage -le 1 ]; then + echo "$0: generating CTM from input lattices" + steps/get_ctm_conf.sh --cmd "$cmd" \ + --use-segments false \ + $data_dir \ + $lang \ + $decode_dir +fi + +if [ $stage -le 2 ]; then + name=`basename $data_dir` + # we just take the ctm from LMWT 10, it doesn't seem to affect the results a lot + ctm=$decode_dir/score_10/$name.ctm + echo "$0: generating weights file from ctm $ctm" + + pad_frames=0 # this did not seem to be helpful but leaving it as an option. + feat-to-len scp:$data_dir/feats.scp ark,t:- >$decode_dir/utt.lengths + if [ ! 
-f $ctm ]; then echo "$0: expected ctm to exist: $ctm"; exit 1; fi + + cat $ctm | awk '$6 == 1.0 && $4 < 1.0' | \ + grep -v -w mm | grep -v -w mhm | grep -v -F '[noise]' | \ + grep -v -F '[laughter]' | grep -v -F '<unk>' | \ + perl -e ' $lengths=shift @ARGV; $pad_frames=shift @ARGV; $silence_weight=shift @ARGV; + $pad_frames >= 0 || die "bad pad-frames value $pad_frames"; + open(L, "<$lengths") || die "opening lengths file"; + @all_utts = (); + %utt2ref = ( ); + while (<L>) { + ($utt, $len) = split(" ", $_); + push @all_utts, $utt; + $array_ref = [ ]; + for ($n = 0; $n < $len; $n++) { ${$array_ref}[$n] = $silence_weight; } + $utt2ref{$utt} = $array_ref; + } + while (<STDIN>) { + @A = split(" ", $_); + @A == 6 || die "bad ctm line $_"; + $utt = $A[0]; $beg = $A[2]; $len = $A[3]; + $beg_int = int($beg * 100) - $pad_frames; + $len_int = int($len * 100) + 2*$pad_frames; + $array_ref = $utt2ref{$utt}; + !defined $array_ref && die "No length info for utterance $utt"; + for ($t = $beg_int; $t < $beg_int + $len_int; $t++) { + if ($t >= 0 && $t < @$array_ref) { + ${$array_ref}[$t] = 1; + } + } + } + foreach $utt (@all_utts) { $array_ref = $utt2ref{$utt}; + print $utt, " [ ", join(" ", @$array_ref), " ]\n"; + } ' $decode_dir/utt.lengths $pad_frames $silence_weight | \ + gzip -c > $output_wts_file_gz +fi diff --git a/egs/libri_css/s5_mono/local/format_lms.sh b/egs/libri_css/s5_mono/local/format_lms.sh new file mode 100755 index 00000000000..d1a18bada88 --- /dev/null +++ b/egs/libri_css/s5_mono/local/format_lms.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +# Copyright 2014 Vassil Panayotov +# Apache 2.0 + +# Prepares the test-time language model (G) transducers +# (adapted from wsj/s5/local/wsj_format_data.sh) + +. ./path.sh || exit 1; + +# begin configuration section +src_dir=data/lang +# end configuration section + +. utils/parse_options.sh || exit 1; + +set -e + +if [ $# -ne 1 ]; then + echo "Usage: $0 <lm-dir>" + echo "e.g.: $0 /export/a15/vpanayotov/data/lm" + echo ", where:" + echo " <lm-dir> is the directory in which the language model is stored/downloaded" + echo "Options:" + echo " --src-dir <lang-dir> # source lang directory, default data/lang" + exit 1 +fi + +lm_dir=$1 + +if [ ! -d $lm_dir ]; then + echo "$0: expected source LM directory $lm_dir to exist" + exit 1; +fi +if [ ! -f $src_dir/words.txt ]; then + echo "$0: expected $src_dir/words.txt to exist." + exit 1; +fi + + +tmpdir=data/local/lm_tmp.$$ +trap "rm -r $tmpdir" EXIT + +mkdir -p $tmpdir + +for lm_suffix in tgsmall tgmed; do + # tglarge is prepared by a separate command, called from run.sh; we don't + # want to compile G.fst for tglarge, as it takes a while. + test=${src_dir}_test_${lm_suffix} + mkdir -p $test + cp -r ${src_dir}/* $test + gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst + utils/validate_lang.pl --skip-determinization-check $test || exit 1; +done + +echo "Succeeded in formatting data." + +exit 0 diff --git a/egs/libri_css/s5_mono/local/get_perspeaker_output.py b/egs/libri_css/s5_mono/local/get_perspeaker_output.py new file mode 100755 index 00000000000..fcf60f708a2 --- /dev/null +++ b/egs/libri_css/s5_mono/local/get_perspeaker_output.py @@ -0,0 +1,91 @@ +#! /usr/bin/env python3 +# Copyright 2020 Desh Raj +# Apache 2.0. 
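+# Example invocation (cf. local/multispeaker_score.sh, which calls this script; +# 'out/' and 'data/dev' below are illustrative paths): +#   local/get_perspeaker_output.py --affix "ref" out/ref_filt.txt data/dev/utt2spk out/ +# This writes one file per (recording, speaker) pair into the output directory.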
+"""This script splits a kaldi output (text) file + into per_speaker output (text) file""" + +import argparse, os +import itertools +from collections import defaultdict + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script splits a kaldi text file + into per_speaker text files""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--affix", type=str, + help="Append in front of output file") + parser.add_argument("--multi-stream", dest='multi_stream', action='store_true', + default=False, + help="Score with multiple decoding streams e.g. CSS") + parser.add_argument("input_text", type=str, + help="path of text file") + parser.add_argument("input_utt2spk", type=str, + help="path of utt2spk file") + parser.add_argument("output_dir", type=str, + help="Output path for per_session per_speaker reference files") + args = parser.parse_args() + return args + +class Utterance: + """Stores all information about an utterance""" + reco_id = '' + spk_id = '' + text = '' + start_time = 0 + end_time = 0 + + def __init__(self, uttid, spkid, text, multi_stream): + parts = uttid.strip().split('_') + self.reco_id = '_'.join(parts[1:4]) + if not multi_stream: + self.start_time = float(parts[4])/100 + self.end_time = float(parts[5])/100 + else: + self.start_time = float(parts[5])/100 + self.end_time = float(parts[6])/100 + self.spk_id = spkid + self.text = text + +def main(): + args = get_args() + utt2spk = {} + utt_list = [] + + # First we read the utt2spk file and create a mapping + for line in open(args.input_utt2spk): + uttid, spkid = line.strip().split() + utt2spk[uttid] = spkid + + # Next we read the input text file and create a list of + # Utterance class objects + for line in open(args.input_text): + parts = line.strip().split(maxsplit=1) + uttid = parts[0] + text = "" if len(parts) == 1 else parts[1] + utterance = Utterance(uttid, utt2spk[uttid], text, args.multi_stream) + utt_list.append(utterance) + + groupfn = lambda x: (x.reco_id, x.spk_id) + sort(utt_list, key=groupfn) + # We group the utterance list into a dictionary indexed by (reco_id, spk_id) + reco_spk_to_utts = defaultdict(list, + {uid : list(g) for uid, g in itertools.groupby(utt_list, groupfn)}) + + # Now for each (reco_id, spk_id) pair, we write the concatenated text to an + # output (we assign speaker ids 1,2,3,..) + for i, uid in enumerate(sorted(reco_spk_to_utts.keys())): + reco_id = reco_spk_to_utts[uid][0].reco_id + output_file = os.path.join(args.output_dir, '{}_{}_{}_comb'.format(args.affix, i, reco_id)) + output_writer = open(output_file, 'w') + utterances = reco_spk_to_utts[uid] + + # We sort all utterances by start time and concatenate. + sorted_utterances = sorted(utterances, key=lambda x: x.start_time) + combined_text = ' '.join([utt.text for utt in sorted_utterances]) + + output_writer.write("{} {}".format(reco_id, combined_text)) + output_writer.close() + +if __name__ == '__main__': + main() diff --git a/egs/libri_css/s5_mono/local/make_voxceleb1.pl b/egs/libri_css/s5_mono/local/make_voxceleb1.pl new file mode 100755 index 00000000000..2268c20ab52 --- /dev/null +++ b/egs/libri_css/s5_mono/local/make_voxceleb1.pl @@ -0,0 +1,130 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# +# Usage: make_voxceleb1.pl /export/voxceleb1 data/ + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/voxceleb1 data/\n"; + exit(1); +} + +($data_base, $out_dir) = @ARGV; +my $out_test_dir = "$out_dir/voxceleb1_test"; +my $out_train_dir = "$out_dir/voxceleb1_train"; + +if (system("mkdir -p $out_test_dir") != 0) { + die "Error making directory $out_test_dir"; +} + +if (system("mkdir -p $out_train_dir") != 0) { + die "Error making directory $out_train_dir"; +} + +opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (! -e "$data_base/voxceleb1_test.txt") { + system("wget -O $data_base/voxceleb1_test.txt http://www.openslr.org/resources/49/voxceleb1_test.txt"); +} + +if (! -e "$data_base/vox1_meta.csv") { + system("wget -O $data_base/vox1_meta.csv http://www.openslr.org/resources/49/vox1_meta.csv"); +} + +open(TRIAL_IN, "<", "$data_base/voxceleb1_test.txt") or die "Could not open the verification trials file $data_base/voxceleb1_test.txt"; +open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv"; +open(SPKR_TEST, ">", "$out_test_dir/utt2spk") or die "Could not open the output file $out_test_dir/utt2spk"; +open(WAV_TEST, ">", "$out_test_dir/wav.scp") or die "Could not open the output file $out_test_dir/wav.scp"; +open(SPKR_TRAIN, ">", "$out_train_dir/utt2spk") or die "Could not open the output file $out_train_dir/utt2spk"; +open(WAV_TRAIN, ">", "$out_train_dir/wav.scp") or die "Could not open the output file $out_train_dir/wav.scp"; +open(TRIAL_OUT, ">", "$out_test_dir/trials") or die "Could not open the output file $out_test_dir/trials"; + +my %id2spkr = (); +while (<META_IN>) { + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split; + $id2spkr{$vox_id} = $spkr_id; +} + +my %test_spkrs = (); +while (<TRIAL_IN>) { + chomp; + my ($tar_or_non, $path1, $path2) = split; + + # Create entry for left-hand side of trial + my ($spkr_id, $filename) = split('/', $path1); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id1 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + # Create entry for right-hand side of trial + ($spkr_id, $filename) = split('/', $path2); + $rec_id = substr($filename, 0, 11); + $segment = substr($filename, 12, 7); + my $utt_id2 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; +} + +foreach (@spkr_dirs) { + my $spkr_id = $_; + my $new_spkr_id = $spkr_id; + # If we're using a newer version of VoxCeleb1, we need to "deanonymize" + # the speaker labels. 
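+ # (e.g. a speaker directory named 'id10001' in newer distributions is mapped back + # to the original speaker name listed in vox1_meta.csv; old-style name directories + # are left unchanged.)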
+ if (exists $id2spkr{$spkr_id}) { + $new_spkr_id = $id2spkr{$spkr_id}; + } + opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $filename = $_; + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; + my $utt_id = "$new_spkr_id-$rec_id-$segment"; + if (exists $test_spkrs{$new_spkr_id}) { + print WAV_TEST "$utt_id", " $wav", "\n"; + print SPKR_TEST "$utt_id", " $new_spkr_id", "\n"; + } else { + print WAV_TRAIN "$utt_id", " $wav", "\n"; + print SPKR_TRAIN "$utt_id", " $new_spkr_id", "\n"; + } + } +} + +close(SPKR_TEST) or die; +close(WAV_TEST) or die; +close(SPKR_TRAIN) or die; +close(WAV_TRAIN) or die; +close(TRIAL_OUT) or die; +close(TRIAL_IN) or die; +close(META_IN) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_test_dir/utt2spk >$out_test_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_test_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_test_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_test_dir") != 0) { + die "Error validating directory $out_test_dir"; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_train_dir/utt2spk >$out_train_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_train_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_train_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_train_dir") != 0) { + die "Error validating directory $out_train_dir"; +} diff --git a/egs/libri_css/s5_mono/local/make_voxceleb2.pl b/egs/libri_css/s5_mono/local/make_voxceleb2.pl new file mode 100755 index 00000000000..34c1591eba3 --- /dev/null +++ b/egs/libri_css/s5_mono/local/make_voxceleb2.pl @@ -0,0 +1,70 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# +# Usage: make_voxceleb2.pl /export/voxceleb2 dev data/dev +# +# Note: This script requires ffmpeg to be installed and its location included in $PATH. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb2 dev data/dev\n"; + exit(1); +} + +# Check that ffmpeg is installed. +if (`which ffmpeg` eq "") { + die "Error: this script requires that ffmpeg is installed."; +} + +($data_base, $dataset, $out_dir) = @ARGV; + +if ("$dataset" ne "dev" && "$dataset" ne "test") { + die "dataset parameter must be 'dev' or 'test'!"; +} + +opendir my $dh, "$data_base/$dataset/aac" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/$dataset/aac/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; + +foreach (@spkr_dirs) { + my $spkr_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/aac/$spkr_id/$_" && ! 
/^\.{1,2}$/} readdir($dh); + closedir $dh; + + foreach (@rec_dirs) { + my $rec_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.m4a$/} readdir($dh); + closedir $dh; + + foreach (@files) { + my $name = $_; + my $wav = "ffmpeg -v 8 -i $data_base/$dataset/aac/$spkr_id/$rec_id/$name.m4a -f wav -acodec pcm_s16le -|"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + } + } +} +close(SPKR) or die; +close(WAV) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/libri_css/s5_mono/local/multispeaker_score.sh b/egs/libri_css/s5_mono/local/multispeaker_score.sh new file mode 100755 index 00000000000..676c29f9192 --- /dev/null +++ b/egs/libri_css/s5_mono/local/multispeaker_score.sh @@ -0,0 +1,111 @@ +#!/usr/bin/env bash +# Copyright 2019 Ashish Arora, Yusuke Fujita +# 2020 Desh Raj +# Apache 2.0. +# This script takes a reference and hypothesis text file, and performs +# multispeaker scoring. + +stage=0 +datadir= +get_stats=false # TODO: Implement 'true' (i.e. per utterance alignment of output) +multistream=false # Set to true if input audio was separated (e.g. CSS) + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +# (must be computed after parse_options.sh so that --multistream takes effect) +multistream_opt= +if [ $multistream == "true" ]; then + multistream_opt="--multi-stream" +fi + +if [ $# != 3 ]; then + echo "Usage: $0 <ref-file> <hyp-file> <out-dir>" + echo "e.g.: $0 data/dev/text \ + exp/chain_cleaned/tdnn_1d_sp/decode_dev_diarized/scoring_kaldi/penalty_1.0/10.txt \ + exp/chain_cleaned/tdnn_1d_sp/decode_dev_diarized/scoring_kaldi_multispeaker" + exit 1; +fi + +ref_file=$1 +hyp_file=$2 +out_dir=$3 + +output_dir=$out_dir/per_speaker_output +wer_dir=$out_dir/per_speaker_wer + +if [ $multistream == "true" ]; then + recording_ids=( $(awk '{$1=$1;sub(/_[0-9]*$/, "", $1); print $1}' data/$datadir/wav.scp | sort -u) ) +else + recording_ids=( $(awk '{print $1}' data/$datadir/wav.scp) ) +fi + +for f in $ref_file $hyp_file; do + [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 0 ]; then + # generate per speaker per recording files for reference and hypothesis + mkdir -p $output_dir $wer_dir + local/wer_output_filter < $ref_file > $output_dir/ref_filt.txt + local/wer_output_filter < $hyp_file > $output_dir/hyp_filt.txt + local/get_perspeaker_output.py --affix "ref" $output_dir/ref_filt.txt data/$datadir/utt2spk.bak $output_dir + local/get_perspeaker_output.py --affix "hyp" $multistream_opt $output_dir/hyp_filt.txt data/$datadir/utt2spk $output_dir +fi + +if [ $stage -le 1 ]; then + # Now for each recording, we score all pairs of ref/hyp speaker outputs + for reco_id in "${recording_ids[@]}"; do + # Get list of ref files + reco_ref_files=( $( ls $output_dir/ref* | grep $reco_id ) ) + # Get list of hyp files + reco_hyp_files=( $( ls $output_dir/hyp* | grep $reco_id ) ) + for reco_ref in "${reco_ref_files[@]}"; do + for reco_hyp in "${reco_hyp_files[@]}"; do + ref_spkid=$( basename "$reco_ref" | cut -d'_' -f2 ) + hyp_spkid=$( basename "$reco_hyp" | cut -d'_' -f2 ) + # compute WER with combined texts + compute-wer --text --mode=present ark:$reco_ref ark:$reco_hyp \ + > $wer_dir/wer_${reco_id}_r${ref_spkid}h${hyp_spkid} 2>/dev/null + done + done + done +fi + +if [ $stage -le 2 ]; then + for reco_id in "${recording_ids[@]}"; do + # For each recording, we create a summary file of all permutations + >$wer_dir/summary_$reco_id + reco_wer_files=( $( ls $wer_dir/wer_* | grep $reco_id ) ) + for reco_wer in "${reco_wer_files[@]}"; do + ref_hyp_spkid=$( basename "$reco_wer" | cut -d'_' -f5 ) + cur_wer=$( head -1 $reco_wer ) + printf "$ref_hyp_spkid %s\n" "${cur_wer}" >> $wer_dir/summary_$reco_id + done + + # Now we get best wer for each recording id + cat $wer_dir/summary_$reco_id \ + | local/best_wer_matching.py \ + > $wer_dir/best_wer_$reco_id + + done + rm $wer_dir/best_wer_all 2> /dev/null + awk ' + function basename(file, a, n) { + n = split(file, a, "/") + return a[n] + } + {printf "%s %s\n", basename(FILENAME), $0}' $wer_dir/best_wer_* > $wer_dir/best_wer_all +fi + +# Also compute the average WER stats over all conditions. This will be used +# for LMWT and WIP selection. +if [ $stage -le 3 ]; then + cat $wer_dir/best_wer_all | sed 's/,//g' | awk ' + { + ERR+=$5; WC+=$7; INS+=$8; DEL+=$10; SUB+=$12; + }END{ + WER=ERR*100/WC; + printf("%%WER %.2f [ %d / %d, %d ins, %d del, %d sub ]",WER,ERR,WC,INS,DEL,SUB); + } + ' > $wer_dir/best_wer_average +fi diff --git a/egs/libri_css/s5_mono/local/nnet3/decode.sh b/egs/libri_css/s5_mono/local/nnet3/decode.sh new file mode 100755 index 00000000000..795fec459b9 --- /dev/null +++ b/egs/libri_css/s5_mono/local/nnet3/decode.sh @@ -0,0 +1,163 @@ +#!/usr/bin/env bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script does 2-stage decoding where the first stage is used to get +# reliable frames for i-vector extraction. 
+ +set -e + +# general opts +iter= +stage=0 +nj=30 +affix= # affix for decode directory + +# ivector opts +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=6000 +ivector_scale=0.75 +get_weights_from_ctm=true +weights_file= # use weights from this archive (must be gzip-compressed) +silence_weight=0.00001 # apply this weight to silence frames during i-vector extraction +ivector_dir=exp/nnet3 + +# decode opts +pass2_decode_opts="--min-active 1000" +lattice_beam=8 +extra_left_context=0 # change for (B)LSTM +extra_right_context=0 # change for BLSTM +frames_per_chunk=50 # change for (B)LSTM +acwt=0.1 # important to change this when using chain models +post_decode_acwt=1.0 # important to change this when using chain models +extra_left_context_initial=0 +extra_right_context_final=0 + +graph_affix= + +score_opts="--min-lmwt 6 --max-lmwt 13" + +. ./cmd.sh +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "Usage: $0 [options] <data-dir> <lang-dir> <graph-dir> <model-dir>" + echo " Options:" + echo " --stage (0|1|2) # start the script from part-way through." + echo "e.g.:" + echo "$0 data/dev data/lang exp/tri5a/graph_pp exp/nnet3/tdnn" + exit 1; +fi + +data=$1 # data directory +lang=$2 # data/lang +graph=$3 #exp/tri5a/graph_pp +dir=$4 # exp/nnet3/tdnn + +model_affix=`basename $dir` +ivector_affix=${affix:+_$affix}_chain_${model_affix}${iter:+_iter$iter} +affix=${affix:+_${affix}}${iter:+_iter${iter}} + +if [ $stage -le 1 ]; then + if [ ! -s ${data}_hires/feats.scp ]; then + utils/copy_data_dir.sh $data ${data}_hires + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$train_cmd" ${data}_hires + steps/compute_cmvn_stats.sh ${data}_hires + fi +fi + +data_set=$(basename $data) +if [ $stage -le 2 ]; then + echo "Extracting i-vectors, stage 1" + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + --max-count $max_count \ + ${data}_hires $ivector_dir/extractor \ + $ivector_dir/ivectors_${data_set}${ivector_affix}_stage1; + # float comparisons are hard in bash + if [ `bc <<< "$ivector_scale != 1"` -eq 1 ]; then + ivector_scale_affix=_scale$ivector_scale + else + ivector_scale_affix= + fi + + if [ ! -z "$ivector_scale_affix" ]; then + echo "$0: Scaling iVectors, stage 1" + srcdir=$ivector_dir/ivectors_${data_set}${ivector_affix}_stage1 + outdir=$ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 + mkdir -p $outdir + $train_cmd $outdir/log/scale_ivectors.log \ + copy-matrix --scale=$ivector_scale scp:$srcdir/ivector_online.scp ark:- \| \ + copy-feats --compress=true ark:- ark,scp:$outdir/ivector_online.ark,$outdir/ivector_online.scp; + cp $srcdir/ivector_period $outdir/ivector_period + fi +fi + +decode_dir=$dir/decode${graph_affix}_${data_set}${affix} +# generate the lattices +if [ $stage -le 3 ]; then + echo "Generating lattices, stage 1" + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 \ + --skip-scoring true ${iter:+--iter $iter} \ + $graph ${data}_hires ${decode_dir}_stage1; +fi + +if [ $stage -le 4 ]; then + if $get_weights_from_ctm; then + if [ ! 
-z $weights_file ]; then + echo "$0: Using provided vad weights file $weights_file" + ivector_extractor_weights=$weights_file + else + echo "$0 : Generating vad weights file" + ivector_extractor_weights=${decode_dir}_stage1/weights${affix}.gz + local/extract_vad_weights.sh --silence-weight $silence_weight \ + --cmd "$decode_cmd" ${iter:+--iter $iter} \ + ${data}_hires $lang \ + ${decode_dir}_stage1 $ivector_extractor_weights + fi + else + # get weights from best path decoding + ivector_extractor_weights=${decode_dir}_stage1 + fi +fi + +if [ $stage -le 5 ]; then + echo "Extracting i-vectors, stage 2 with weights from $ivector_extractor_weights" + # this does offline decoding, except we estimate the iVectors per + # speaker, excluding silence (based on alignments from a DNN decoding), with a + # different script. This is just to demonstrate that script. + # the --sub-speaker-frames is optional; if provided, it will divide each speaker + # up into "sub-speakers" of at least that many frames... can be useful if + # acoustic conditions drift over time within the speaker's data. + steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $nj \ + --silence-weight $silence_weight \ + --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ + ${data}_hires $lang $ivector_dir/extractor \ + $ivector_extractor_weights $ivector_dir/ivectors_${data_set}${ivector_affix}; +fi + +if [ $stage -le 6 ]; then + echo "Generating lattices, stage 2 with --acwt $acwt" + rm -f ${decode_dir}/.error + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" $pass2_decode_opts \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --skip-scoring false ${iter:+--iter $iter} --lattice-beam $lattice_beam \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix} \ + $graph ${data}_hires ${decode_dir} || touch ${decode_dir}/.error + [ -f ${decode_dir}/.error ] && echo "$0: Error decoding" && exit 1; +fi +exit 0 diff --git a/egs/libri_css/s5_mono/local/nnet3/run_ivector_common.sh b/egs/libri_css/s5_mono/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..26653ccbd5c --- /dev/null +++ b/egs/libri_css/s5_mono/local/nnet3/run_ivector_common.sh @@ -0,0 +1,149 @@ +#!/usr/bin/env bash + +set -e -o pipefail + + +# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually +# be called by more scripts). It contains the common feature preparation and iVector-related parts +# of the script. See those scripts for examples of usage. + + +stage=0 +train_set=train_960_cleaned # you might set this to e.g. train_960 +gmm=tri6b_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. +num_threads_ubm=16 +num_processes=4 +nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it + # becomes exp/nnet3_cleaned or whatever. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + #Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment. _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + echo "$0: fixing input data-dir to remove nonexistent features, in case some " + echo ".. speed-perturbed segments were too short." + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + if [ -f $ali_dir/ali.1.gz ]; then + echo "$0: alignments in $ali_dir appear to already exist. Please either remove them " + echo " ... or use a later --stage option." + exit 1 + fi + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/mfcc/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp; do + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires + done + + # now create a data subset. 60k is 1/5th of the training dataset (around 200 hours). + utils/subset_data_dir.sh data/${train_set}_sp_hires 60000 data/${train_set}_sp_hires_60k +fi + + +if [ $stage -le 4 ]; then + echo "$0: making a subset of data to train the diagonal UBM and the PCA transform." + # We'll one hundredth of the data, since Librispeech is very large. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l " + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. 
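+# ($dir is resolved to an absolute path (featdir) below so that the generated .scp +# entries and storage symlinks stay valid regardless of the working directory.)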
+mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_cmvn_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. + utils/create_data_link.pl $featdir/xvector_cmvn_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp +for f in $data_in/segments $data_in/vad.scp ; do + [ -f $f ] && cp $f $data_out/`basename $f`; +done + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_cmvn_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_cmvn_feats_${name}.JOB.ark,$featdir/xvector_cmvn_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_cmvn_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/libri_css/s5_mono/local/nnet3/xvector/prepare_feats_for_egs.sh b/egs/libri_css/s5_mono/local/nnet3/xvector/prepare_feats_for_egs.sh new file mode 100755 index 00000000000..326b6dbb9fa --- /dev/null +++ b/egs/libri_css/s5_mono/local/nnet3/xvector/prepare_feats_for_egs.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# +# Apache 2.0. + +# This script applies sliding window CMVN and removes silence frames. This +# is performed on the raw features prior to generating examples for training +# the x-vector system. Once the training examples are generated, the features +# created by this script can be removed. + +nj=40 +cmd="run.pl" +stage=0 +norm_vars=false +center=true +compress=true +cmn_window=300 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 <in-data-dir> <out-data-dir> <feats-dir>" + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj <nj> # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." + echo " --norm-vars <true|false> # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/vad.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. 
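+ # (Concretely: when storage/ has been set up by create_split_dir.pl above, this + # pre-creates each .ark as a symlink into one of the distributed storage + # directories, spreading the archives across file systems.)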
+ utils/create_data_link.pl $featdir/xvector_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/libri_css/s5_mono/local/nnet3/xvector/run_xvector.sh b/egs/libri_css/s5_mono/local/nnet3/xvector/run_xvector.sh new file mode 120000 index 00000000000..585b63fd2dd --- /dev/null +++ b/egs/libri_css/s5_mono/local/nnet3/xvector/run_xvector.sh @@ -0,0 +1 @@ +tuning/run_xvector_1a.sh \ No newline at end of file diff --git a/egs/libri_css/s5_mono/local/nnet3/xvector/tuning/run_xvector_1a.sh b/egs/libri_css/s5_mono/local/nnet3/xvector/tuning/run_xvector_1a.sh new file mode 100755 index 00000000000..2189e406a7e --- /dev/null +++ b/egs/libri_css/s5_mono/local/nnet3/xvector/tuning/run_xvector_1a.sh @@ -0,0 +1,149 @@ +#!/usr/bin/env bash +# Copyright 2018 David Snyder +# 2018 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2018 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This script trains the x-vector DNN. The recipe is similar to the one +# described in "Diarization is Hard: Some Experiences and Lessons Learned +# for the JHU Team in the Inaugural DIHARD Challenge" by Sell et al. + +. ./cmd.sh +set -e + +stage=1 +train_stage=-1 +use_gpu=true +remove_egs=false + +data=data/train +nnet_dir=exp/xvector_nnet_1a/ +egs_dir=exp/xvector_nnet_1a/egs + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l) + +# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh. +# The argument --num-repeats is related to the number of times a speaker +# repeats per archive. If it seems like you're getting too many archives +# (e.g., more than 200) try increasing the --frames-per-iter option. The +# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the +# minimum and maximum length (in terms of number of frames) of the features +# in the examples. +# +# To make sense of the egs script, it may be necessary to put an "exit 1" +# command immediately after stage 3. Then, inspect +# exp/<dir>/egs/temp/ranges.* . The ranges files specify the examples that +# will be created, and which archives they will be stored in. Each line of +# ranges.* has the following form: +# <utt-id> <local-ark-indx> <global-ark-indx> <start-frame> <end-frame> <spk-id> +# For example: +# 100304-f-sre2006-kacg-A 1 2 4079 881 23 + +# If you're satisfied with the number of archives (e.g., 50-150 archives is +# reasonable) and with the number of examples per speaker (e.g., 1000-5000 +# is reasonable) then you can let the script continue to the later stages. 
+# Otherwise, try increasing or decreasing the --num-repeats option. You might +# need to fiddle with --frames-per-iter. Increasing this value decreases the +# number of archives and increases the number of examples per archive. +# Decreasing this value increases the number of archives, while decreasing the +# number of examples per archive. +if [ $stage -le 6 ]; then + echo "$0: Getting neural network training egs"; + # dump egs. + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b{03,04,05,06}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage + fi + sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \ + --nj 8 \ + --stage 0 \ + --frames-per-iter 1000000000 \ + --frames-per-iter-diagnostic 500000 \ + --min-frames-per-chunk 200 \ + --max-frames-per-chunk 400 \ + --num-diagnostic-archives 3 \ + --num-repeats 40 \ + "$data" $egs_dir +fi + +if [ $stage -le 7 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}') + feat_dim=$(cat $egs_dir/info/feat_dim) + + # This chunk-size corresponds to the maximum number of frames the + # stats layer is able to pool over. In this script, it corresponds + # to 4 seconds. If the input recording is greater than 4 seconds, + # we will compute multiple xvectors from the same recording and average + # to produce the final xvector. + max_chunk_size=400 + + # The smallest number of frames we're comfortable computing an xvector from. + # Note that the hard minimum is given by the left and right context of the + # frame-level layers. + min_chunk_size=20 + mkdir -p $nnet_dir/configs + cat <<EOF > $nnet_dir/configs/network.xconfig + # please note that it is important to have input layer with the name=input + + # The frame-level layers + input dim=${feat_dim} name=input + relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512 + relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512 + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn4 dim=512 + relu-batchnorm-layer name=tdnn5 dim=1500 + + # The stats pooling layer. Layers after this are segment-level. + # In the config below, the first and last argument (0, and ${max_chunk_size}) + # means that we pool over an input segment starting at frame 0 + # and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1) + # mean that no subsampling is performed. + stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size}) + + # This is where we usually extract the embedding (aka xvector) from. 
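+ # (Here that is tdnn6: the extract.config written below reads from tdnn6.affine, + # giving a 128-dimensional embedding.)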
+ relu-batchnorm-layer name=tdnn6 dim=128 input=stats + output-layer name=output include-log-softmax=true dim=${num_targets} +EOF + + steps/nnet3/xconfig_to_configs.py \ + --xconfig-file $nnet_dir/configs/network.xconfig \ + --config-dir $nnet_dir/configs/ + cp $nnet_dir/configs/final.config $nnet_dir/nnet.config + + # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh + echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config + echo "$max_chunk_size" > $nnet_dir/max_chunk_size + echo "$min_chunk_size" > $nnet_dir/min_chunk_size +fi + +dropout_schedule='0,0@0.20,0.1@0.50,0' +srand=123 +if [ $stage -le 8 ]; then + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.momentum=0.5 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.minibatch-size=64 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2 \ + --trainer.num-epochs=3 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.shuffle-buffer-size=1000 \ + --egs.frames-per-eg=1 \ + --egs.dir="$egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --dir=$nnet_dir || exit 1; +fi + +exit 0; diff --git a/egs/libri_css/s5_mono/local/prepare_data.py b/egs/libri_css/s5_mono/local/prepare_data.py new file mode 100755 index 00000000000..3d5b622ab30 --- /dev/null +++ b/egs/libri_css/s5_mono/local/prepare_data.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# Apache 2.0 + +import argparse, os, glob, tqdm, zipfile +import subprocess + +def write_dict_to_file(utt2data, file_path): + f = open(file_path, 'w') + for utt in utt2data.keys(): + f.write('{} {}\n'.format(utt, utt2data[utt])) + f.close() + return + +def main(args): + os.makedirs(args.tgtpath, exist_ok=True) + + # Dictionary to store all info that we will write to files after + # reading all files. + reco2wav = {} # for wav.scp + reco2segments = {} # for segments + utt2spk = {} # for utt2spk + utt2text = {} # for text + print ("Creating dictionary of all clean LibriSpeech utterances") + if (args.cleanpath): + utt2clean = {} # path to clean utt wav file + command = 'find %s -name "*.flac"' % (args.cleanpath) + wavs = subprocess.check_output(command, shell=True).decode('utf-8').splitlines() + keys = [ os.path.splitext(os.path.basename(wav))[0] for wav in wavs ] + clean_paths = {key:wav for key,wav in zip(keys,wavs)} + + # Create a directory to store channel-separated wav files + wav_dir = os.path.join(args.tgtpath,'wavs') + os.makedirs(wav_dir, exist_ok=True) + + conditions = ('0L','0S','OV10','OV20','OV30','OV40') + for cond in tqdm.tqdm(conditions): + meeting = glob.glob(os.path.join(args.srcpath, cond, 'overlap*')) + for meet in meeting: + # Extract the signals of the selected microphones. 
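+ # (The meeting directory name encodes the mixing condition, e.g. an overlap + # ratio and silence range; its last two '_'-separated fields give the session + # id and overlap ratio parsed below.)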
+ meeting_name = os.path.basename(meet) + _,_,_,_,_,sessid,olr = meeting_name.split('_') + + wav_path = os.path.join(os.path.abspath(meet), 'record', 'raw_recording.wav') + for mic in args.mics: + reco_id = "{}_CH{}_{}".format(sessid, mic, cond) # Session0_CH1_0L + new_wav_path = "sox {} -t wav - remix {} |".format(wav_path, mic+1) # channel will be extracted on the fly + reco2wav[reco_id] = new_wav_path + + segments = [] + with open(os.path.join(os.path.abspath(meet), 'transcription', 'meeting_info.txt'), 'r') as f: + next(f) + for line in f: + start,end,spkid,clean_uttid,text = line.strip().split(maxsplit=4) + start = float("{:.2f}".format(float(start))) + end = float("{:.2f}".format(float(end))) + utt_id = "{}_{}_{}_{}".format(spkid,reco_id,"{:.0f}".format(100*start).zfill(6), + "{:.0f}".format(100*end).zfill(6)) # 6930_Session0_CH1_0L_000853_002463 + utt2spk[utt_id] = spkid + utt2text[utt_id] = text + segments.append((utt_id, start, end)) + if args.cleanpath: + utt2clean[utt_id] = "sox {} -t wav - |".format(clean_paths[clean_uttid]) + + reco2segments[reco_id] = segments + + # Write all dictionaries to respective files + write_dict_to_file(reco2wav, os.path.join(args.tgtpath, 'wav.scp')) + write_dict_to_file(utt2spk, os.path.join(args.tgtpath, 'utt2spk')) + write_dict_to_file(utt2text, os.path.join(args.tgtpath, 'text')) + if args.cleanpath: + # wav_clean.scp can only be produced when --cleanpath was given + write_dict_to_file(utt2clean, os.path.join(args.tgtpath, "wav_clean.scp")) + + f = open(os.path.join(args.tgtpath, 'segments'), 'w') + for reco in reco2segments.keys(): + segments = reco2segments[reco] + for segment in segments: + f.write('{} {} {} {}\n'.format(segment[0], reco, segment[1], segment[2])) + f.close() + + + +def make_argparse(): + parser = argparse.ArgumentParser(description='Reorganize LibriCSS data into Kaldi format.') + parser.add_argument('--srcpath', metavar='', required=True, + help='Original LibriCSS data path.') + parser.add_argument('--tgtpath', metavar='', required=True, + help='Destination path.') + parser.add_argument('--mics', type=int, metavar='<#mics>', nargs='+', default=[0, 1, 2, 3, 4, 5, 6], + help='Microphone indices.') + parser.add_argument('--cleanpath', metavar='', required=False, + help='Path to clean Librispeech data (required for wav_clean.scp)') + + return parser + + + +if __name__ == '__main__': + parser = make_argparse() + args = parser.parse_args() + main(args) diff --git a/egs/libri_css/s5_mono/local/prepare_data_css.py b/egs/libri_css/s5_mono/local/prepare_data_css.py new file mode 100755 index 00000000000..7a87e80e30d --- /dev/null +++ b/egs/libri_css/s5_mono/local/prepare_data_css.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# Apache 2.0 + +import argparse, os, glob, tqdm, pathlib + +def write_dict_to_file(utt2data, file_path): + f = open(file_path, 'w') + for utt in utt2data.keys(): + f.write('{} {}\n'.format(utt, utt2data[utt])) + f.close() + return + +def main(args): + os.makedirs(args.tgtpath, exist_ok=True) + + # Dictionary to store all info that we will write to files after + # reading all files. 
+ reco2wav = {} # for wav.scp + reco2segments = {} # for segments + utt2spk = {} # for utt2spk + utt2text = {} # for text + + # First we create reco2wav from the separated wav files + wavs = os.listdir(args.wav_path) + for wav in wavs: + path = os.path.join(args.wav_path, wav) + _,_,olr,_,sil_max,sessid,_,_,stream = pathlib.Path(path).stem.split('_') + cond = "OV{}".format(int(float(olr))) + if (float(olr) == 0): + if (sil_max == '0.5'): + cond = "0S" + else: + cond = "0L" + wav_name = "{}_CH0_{}_{}".format(sessid, cond, stream) # session0_CH0_0L_1 + reco2wav[wav_name] = path + if (args.volume != 1): + reco2wav[wav_name] = "sox -v {} -t wav {} -t wav - |".format(args.volume, path) + + + # Now we get other info from the original LibriCSS corpus dir + conditions = ('0L','0S','OV10','OV20','OV30','OV40') + for cond in tqdm.tqdm(conditions): + meeting = glob.glob(os.path.join(args.srcpath, cond, 'overlap*')) + for meet in meeting: + segments = [] + _,_,_,_,_,sessid,_ = os.path.basename(meet).split('_') + reco_id = "{}_CH0_{}".format(sessid, cond) # session0_CH0_0L + with open(os.path.join(os.path.abspath(meet), 'transcription', 'meeting_info.txt'), 'r') as f: + next(f) + for line in f: + start,end,spkid,_,text = line.strip().split(maxsplit=4) + start = float("{:.2f}".format(float(start))) + end = float("{:.2f}".format(float(end))) + utt_id = "{}_{}_{}_{}".format(spkid,reco_id,"{:.0f}".format(100*start).zfill(6), + "{:.0f}".format(100*end).zfill(6)) # 6930_Session0_CH1_0L_000853_002463 + utt2spk[utt_id] = spkid + utt2text[utt_id] = text + segments.append((utt_id, start, end)) + + reco2segments[reco_id] = segments + + # Write all dictionaries to respective files + write_dict_to_file(reco2wav, os.path.join(args.tgtpath, 'wav.scp')) + write_dict_to_file(utt2spk, os.path.join(args.tgtpath, 'utt2spk')) + write_dict_to_file(utt2text, os.path.join(args.tgtpath, 'text')) + + f = open(os.path.join(args.tgtpath, 'segments'), 'w') + for reco in reco2segments.keys(): + segments = reco2segments[reco] + for segment in segments: + f.write('{} {} {} {}\n'.format(segment[0], reco, segment[1], segment[2])) + f.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Reorganize LibriCSS data into Kaldi format.' + ' Additionally, use separated wav files.') + parser.add_argument('--srcpath', metavar='', required=True, + help='Original LibriCSS data path.') + parser.add_argument('--wav-path', metavar='', required=True, + help='Path to directory containing separated wavs.') + parser.add_argument('--tgtpath', metavar='', required=True, + help='Destination path.') + parser.add_argument('--volume', default=1, type=float, help='sox -v option') + + args = parser.parse_args() + main(args) diff --git a/egs/libri_css/s5_mono/local/prepare_dict.sh b/egs/libri_css/s5_mono/local/prepare_dict.sh new file mode 100755 index 00000000000..7b345b6bf1c --- /dev/null +++ b/egs/libri_css/s5_mono/local/prepare_dict.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash + +# Copyright 2014 Vassil Panayotov +# Apache 2.0 + +# Prepares the dictionary and auto-generates the pronunciations for the words +# that are in our vocabulary but not in CMUdict + +stage=0 +nj=4 # number of parallel Sequitur G2P jobs to use +cmd=run.pl + + +. utils/parse_options.sh || exit 1; +. 
./path.sh || exit 1 + + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] <lm-dir> <g2p-model-dir> <dst-dir>" + echo "e.g.: $0 /export/a15/vpanayotov/data/lm /export/a15/vpanayotov/data/g2p data/local/dict" + echo "Options:" + echo " --cmd '<command>' # script to launch jobs with, default: run.pl" + echo " --nj <nj> # number of jobs to run, default: 4." + exit 1 +fi + +lm_dir=$1 +g2p_model_dir=$2 +dst_dir=$3 + +vocab=$lm_dir/librispeech-vocab.txt +[ ! -f $vocab ] && echo "$0: vocabulary file not found at $vocab" && exit 1; + +# this file is either a copy of the lexicon we download from openslr.org/11 or is +# created by the G2P steps below +lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt + +cmudict_dir=$dst_dir/cmudict +cmudict_plain=$dst_dir/cmudict.0.7a.plain + +mkdir -p $dst_dir || exit 1; + +if [ $stage -le 0 ]; then + echo "Downloading and preparing CMUdict" + if [ ! -s $cmudict_dir/cmudict.0.7a ]; then + svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $cmudict_dir || exit 1; + fi + echo "Removing the pronunciation variant markers ..." + grep -v ';;;' $cmudict_dir/cmudict.0.7a | \ + perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \ + > $cmudict_plain || exit 1; +fi + + +if [ $stage -le 1 ]; then + # check that Sequitur G2P is installed + if [ ! -f "$sequitur" ]; then + if ! which swig >&/dev/null; then + echo "Please install 'swig' and then run $KALDI_ROOT/tools/extras/install_sequitur.sh" + exit 1 + else + echo "Sequitur G2P not found - running $KALDI_ROOT/tools/extras/install_sequitur.sh" + pushd $KALDI_ROOT/tools + extras/install_sequitur.sh || exit 1 + popd + fi + fi + [[ -f "$sequitur" ]] || { echo "Still can't find Sequitur G2P - check your path.sh"; exit 1; } + + g2p_dir=$dst_dir/g2p + auto_vocab_prefix="$g2p_dir/vocab_autogen" + auto_lexicon_prefix="$g2p_dir/lexicon_autogen" + + mkdir -p $g2p_dir/log + auto_vocab_splits=$(eval "echo $auto_vocab_prefix.{$(seq -s',' $nj | sed 's/,$//')}") + awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $cmudict_plain $vocab |\ + sort | tee $g2p_dir/vocab_autogen.full |\ + utils/split_scp.pl /dev/stdin $auto_vocab_splits || exit 1 + echo "Autogenerating pronunciations for the words in $auto_vocab_prefix.* ..." + $cmd JOB=1:$nj $g2p_dir/log/g2p.JOB.log \ + local/g2p.sh $auto_vocab_prefix.JOB $g2p_model_dir $auto_lexicon_prefix.JOB || exit 1 + g2p_vocab_size=$(wc -l <$g2p_dir/vocab_autogen.full) + g2p_lex_size=$(wc -l < <(cat $auto_lexicon_prefix.*)) + [[ "$g2p_vocab_size" -eq "$g2p_lex_size" ]] || { echo "Unexpected G2P error"; exit 1; } + sort <(cat $auto_vocab_prefix.*) >$dst_dir/vocab_autogen.txt + sort <(cat $auto_lexicon_prefix.*) >$dst_dir/lexicon_autogen.txt + echo "$(wc -l <$g2p_dir/vocab_autogen.full) pronunciations autogenerated OK" +fi + +if [ $stage -le 2 ]; then + echo "Combining the CMUdict pronunciations with the autogenerated ones ..." + awk 'NR==FNR{a[$1]=1; next} ($1 in a)' $vocab $cmudict_plain |\ + cat - $dst_dir/lexicon_autogen.txt | sort >$lexicon_raw_nosil || exit 1 + raw_lex_size=$(cat $lexicon_raw_nosil | awk '{print $1}' | sort -u | wc -l) + vocab_size=$(wc -l <$vocab) + [[ "$vocab_size" -eq "$raw_lex_size" ]] || { + echo "Inconsistent lexicon($raw_lex_size) vs vocabulary($vocab_size) size!"; + exit 1; } + echo "Combined lexicon saved to '$lexicon_raw_nosil'" +fi + +# The copy operation below is necessary if we skip the G2P stages (e.g. using --stage 3) +if [[ ! 
-s "$lexicon_raw_nosil" ]]; then + cp $lm_dir/librispeech-lexicon.txt $lexicon_raw_nosil || exit 1 +fi + +if [ $stage -le 3 ]; then + silence_phones=$dst_dir/silence_phones.txt + optional_silence=$dst_dir/optional_silence.txt + nonsil_phones=$dst_dir/nonsilence_phones.txt + extra_questions=$dst_dir/extra_questions.txt + + echo "Preparing phone lists and clustering questions" + (echo SIL; echo SPN;) > $silence_phones + echo SIL > $optional_silence + # nonsilence phones; on each line is a list of phones that correspond + # really to the same base phone. + awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $lexicon_raw_nosil |\ + sort -u |\ + perl -e 'while(<>){ + chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; + $phones_of{$1} .= "$_ "; } + foreach $list (values %phones_of) {print $list . "\n"; } ' | sort \ + > $nonsil_phones || exit 1; + # A few extra questions that will be added to those obtained by automatically clustering + # the "real" phones. These ask about stress; there's also one for silence. + cat $silence_phones| awk '{printf("%s ", $1);} END{printf "\n";}' > $extra_questions || exit 1; + cat $nonsil_phones | perl -e 'while(<>){ foreach $p (split(" ", $_)) { + $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + >> $extra_questions || exit 1; + echo "$(wc -l <$silence_phones) silence phones saved to: $silence_phones" + echo "$(wc -l <$optional_silence) optional silence saved to: $optional_silence" + echo "$(wc -l <$nonsil_phones) non-silence phones saved to: $nonsil_phones" + echo "$(wc -l <$extra_questions) extra triphone clustering-related questions saved to: $extra_questions" +fi + +if [ $stage -le 4 ]; then + (echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; ) |\ + cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt + echo "Lexicon text file saved as: $dst_dir/lexicon.txt" +fi + +exit 0 diff --git a/egs/libri_css/s5_mono/local/rnnlm/train.sh b/egs/libri_css/s5_mono/local/rnnlm/train.sh new file mode 120000 index 00000000000..8e647598556 --- /dev/null +++ b/egs/libri_css/s5_mono/local/rnnlm/train.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a.sh \ No newline at end of file diff --git a/egs/libri_css/s5_mono/local/rnnlm/tuning/run_tdnn_lstm_1a.sh b/egs/libri_css/s5_mono/local/rnnlm/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..0fcf4c354b1 --- /dev/null +++ b/egs/libri_css/s5_mono/local/rnnlm/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# 2018 Ke Li + +# This script trains LMs on the librispeech-lm-norm.txt.gz. + +# rnnlm/train_rnnlm.sh: best iteration (out of 143) was 142, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 109.2 / 110.7. 
+# Train objf: -5.74 -5.54 -5.44 -5.37 -5.32 -5.28 -5.25 -5.23 -5.20 -5.18 -5.15 -5.14 -5.12 -5.10 -5.09 -5.08 -5.07 -5.05 -5.04 -5.04 -5.03 -5.02 -5.01 -5.00 -4.99 -4.99 -4.98 -4.97 -4.96 -4.96 -4.95 -4.95 -4.94 -4.93 -4.93 -4.92 -4.92 -4.92 -4.91 -4.90 -4.90 -4.89 -4.89 -4.89 -4.88 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.86 -4.85 -4.85 -4.84 -4.84 -4.84 -4.84 -4.84 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.82 -4.81 -4.81 -4.81 -4.81 -4.80 -4.80 -4.80 -4.79 -4.79 -4.79 -4.79 -4.78 -4.79 -4.78 -4.78 -4.78 -4.78 -4.77 -4.77 -4.77 -4.77 -4.77 -4.76 -4.76 -4.76 -4.76 -4.76 -4.75 -4.75 -4.75 -4.75 -4.75 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.73 -4.74 -4.74 -4.73 -4.73 -4.73 -4.73 -4.73 -4.72 -4.73 -4.73 -4.73 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.71 -4.71 -4.71 -4.71 -4.71 -4.70 -4.70 -4.70 -4.70 -4.70 -4.69 -4.69 -4.69 -4.69 -4.69 -4.69 -4.68 -4.68 +# Dev objf: -5.99 -5.65 -5.53 -5.44 -5.38 -5.34 -5.30 -5.27 -5.22 -5.20 -5.18 -5.16 -5.14 -5.12 -5.11 -5.10 -5.09 -5.08 -5.07 -5.05 -5.04 -5.04 -5.03 -5.01 -5.00 -4.99 -4.99 -4.98 -4.97 -4.97 0.00 -4.96 -4.95 -4.95 -4.94 -4.93 -4.93 -4.92 -4.92 -4.91 -4.91 -4.90 -4.90 -4.89 -4.89 -4.89 -4.88 -4.88 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.85 -4.85 -4.87 -4.84 -4.84 -4.84 -4.83 -4.91 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.82 -4.81 -4.81 -4.81 -4.80 -4.80 -4.80 -4.80 -4.80 -4.79 -4.79 -4.79 -4.79 -4.79 -4.79 -4.78 -4.78 -4.79 -4.78 -4.77 -4.77 -4.77 -4.77 -4.77 -4.77 -4.77 -4.76 -4.76 -4.76 -4.76 -4.76 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.73 -4.74 -4.73 -4.73 -4.73 -4.73 -4.73 -4.73 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 + +# WER summary on dev and test sets +# System tdnn_1d_sp +lattice_rescore +nbest_rescore +# WER on dev(fglarge) 3.34 2.71 2.62 +# WER on dev(tglarge) 3.44 2.75 2.66 +# WER on dev_other(fglarge) 8.70 7.37 7.55 +# WER on dev_other(tglarge) 9.25 7.56 7.73 +# WER on test(fglarge) 3.77 3.12 3.06 +# WER on test(tglarge) 3.85 3.18 3.11 +# WER on test_other(fglarge) 8.91 7.63 7.68 +# WER on test_other(tglarge) 9.31 7.83 7.95 + +# command to get the WERs above: +# tdnn_1d_sp +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}/wer* | best_wer.sh; done; done +# tdnn_1d_sp with lattice rescoring +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}_rnnlm_1a_rescore/wer* | best_wer.sh; done; done +# tdnn_1d_sp with nbest rescoring +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}_rnnlm_1a_nbest_rescore/wer* | best_wer.sh; done; done + +# Begin configuration section. + +dir=exp/rnnlm_lstm_1a +embedding_dim=1024 +lstm_rpd=256 +lstm_nrpd=256 +stage=-10 +train_stage=-10 +epochs=4 + +# variables for lattice rescoring +run_lat_rescore=true +run_nbest_rescore=true +run_backward_rnnlm=false +ac_model_dir=exp/chain_cleaned/tdnn_1d_sp +decode_dir_suffix=rnnlm_1a +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true + +. ./cmd.sh +. 
./utils/parse_options.sh
+
+text=data/local/lm/librispeech-lm-norm.txt.gz
+lexicon=data/lang_nosp/words.txt
+text_dir=data/rnnlm/text
+mkdir -p $dir/config
+set -e
+
+for f in $lexicon; do
+  [ ! -f $f ] && \
+    echo "$0: expected file $f to exist; search for utils/prepare_lang.sh in local/train_asr.sh" && exit 1
+done
+
+if [ $stage -le 0 ]; then
+  mkdir -p $text_dir
+  if [ ! -f $text ]; then
+    wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P data/local/lm
+  fi
+  echo -n >$text_dir/dev.txt
+  # hold out one in every 2000 lines as dev data.
+  gunzip -c $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%2000 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/librispeech.txt
+fi
+
+if [ $stage -le 1 ]; then
+  cp $lexicon $dir/config/
+  n=`cat $dir/config/words.txt | wc -l`
+  echo "<brk> $n" >> $dir/config/words.txt
+
+  # words that are not present in words.txt but are in the training or dev data, will be
+  # mapped to <unk> during training.
+  echo "<unk>" >$dir/config/oov.txt
+
+  cat > $dir/config/data_weights.txt <<EOF
+librispeech   1   1.0
+EOF
+
+  rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
+                             --unk-word="<unk>" \
+                             --data-weights-file=$dir/config/data_weights.txt \
+                             $text_dir >$dir/config/unigram_probs.txt
+
+  # choose features
+  rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
+                           --top-word-features=5000 \
+                           --use-constant-feature=true \
+                           --special-words='<s>,</s>,<brk>,<unk>,<SPOKEN_NOISE>' \
+                           $dir/config/words.txt > $dir/config/features.txt
+
+  cat >$dir/config/xconfig <<EOF
+input dim=$embedding_dim name=input
+relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
+fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
+relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
+fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
+relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
+output-layer name=output include-log-softmax=false dim=$embedding_dim
+EOF
+  rnnlm/validate_config_dir.sh $text_dir $dir/config
+fi
+
+  echo "  --dev_decodedir <dir>                # dev set decoding directory"
+  echo "  --eval_decodedir <dir>               # eval set decoding directory"
+  echo "  --dev_datadir <dir>                  # dev set data directory"
+  echo "  --eval_datadir <dir>                 # eval set data directory"
+  echo "  --min_lmwt <int>                     # minimum LM-weight for lattice rescoring"
+  echo "  --max_lmwt <int>                     # maximum LM-weight for lattice rescoring"
+  echo "  --multistream <true|false>           # set to true if scoring multistream audio"
+
+  exit 1;
+fi
+
+mkdir -p $dev_decodedir/scoring_kaldi_multispeaker
+
+if [ $stage -le 1 ]; then
+  # obtaining multi speaker WER for all lmwt and wip
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    $cmd LMWT=$min_lmwt:$max_lmwt \
+      $dev_decodedir/scoring_kaldi_multispeaker/multispeaker_score.LMWT.log \
+      local/multispeaker_score.sh --multistream $multistream \
+        --datadir $dev_datadir --get_stats false data/$dev_datadir/text \
+        $dev_decodedir/scoring_kaldi/penalty_$wip/LMWT.txt \
+        $dev_decodedir/scoring_kaldi_multispeaker/penalty_$wip/LMWT
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  # obtaining best lmwt, wip and wer
+  echo "Selecting best LM weight and WIP based on the average WER"
+  grep WER $dev_decodedir/scoring_kaldi_multispeaker/penalty_*/*/per_speaker_wer/best_wer_average | \
+    utils/best_wer.sh >& $dev_decodedir/scoring_kaldi_multispeaker/best_wer_average
+
+  best_wer_file=$(awk '{print $NF}' $dev_decodedir/scoring_kaldi_multispeaker/best_wer_average)
+  best_lmwt=$(echo $best_wer_file | cut -d'/' -f7)
+  best_wip=$(echo $best_wer_file | cut -d'/' -f6 | cut -d'_' -f2)
+
+  # printing and storing best lmwt and wip
+  echo "best LM weight: $best_lmwt"
+  echo "best insertion penalty weight: $best_wip"
+
+  echo $best_lmwt > $dev_decodedir/scoring_kaldi_multispeaker/lmwt
+  echo $best_wip > $dev_decodedir/scoring_kaldi_multispeaker/wip
+fi
+
+if [ $stage -le 3 ]; then
+  # Get WER for all conditions for the selected LMWT and WIP and remove other files
+  best_lmwt="$(cat $dev_decodedir/scoring_kaldi_multispeaker/lmwt)"
+  best_wip="$(cat $dev_decodedir/scoring_kaldi_multispeaker/wip)"
+  cat $dev_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/per_speaker_wer/best_wer_all \
+    > $dev_decodedir/scoring_kaldi_multispeaker/best_wer
+  echo "Cleaning up WER files..."
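+  # Each line of best_wer_all (produced via local/multispeaker_score.sh) is
+  # expected to look like:
+  #   <condition> %WER 12.34 [ 1234 / 10000, 100 ins, 200 del, 934 sub ]
+  # so in the awk commands below, $5 is the total error count, $7 the reference
+  # word count, and $8/$10/$12 the insertion/deletion/substitution counts.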
+ find $dev_decodedir/scoring_kaldi_multispeaker/penalty_*/*/per_speaker_wer -maxdepth 1 -name "wer_*" -delete + + # Compute overall WER average + cat $dev_decodedir/scoring_kaldi_multispeaker/best_wer | awk ' + { + ERR+=$5; WC+=$7; INS+=$8; DEL+=$10; SUB+=$12; + }END{ + WER=ERR*100/WC; + printf("%%WER %.2f [ %d / %d, %d ins, %d del, %d sub ]",WER,ERR,WC,INS,DEL,SUB); + } + ' > $dev_decodedir/scoring_kaldi_multispeaker/best_wer_average +fi + +# Now scoring the eval set using best LMWT and WIP + +if [ $stage -le 4 ]; then + # obtaining per recording stats for eval + best_lmwt="$(cat $dev_decodedir/scoring_kaldi_multispeaker/lmwt)" + best_wip="$(cat $dev_decodedir/scoring_kaldi_multispeaker/wip)" + local/multispeaker_score.sh \ + --multistream $multistream \ + --datadir $eval_datadir data/$eval_datadir/text \ + $eval_decodedir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + $eval_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/ +fi + +if [ $stage -le 5 ]; then + # obtaining eval wer corresponding to best lmwt, best_array and wip of dev + best_lmwt="$(cat $dev_decodedir/scoring_kaldi_multispeaker/lmwt)" + best_wip="$(cat $dev_decodedir/scoring_kaldi_multispeaker/wip)" + + find $eval_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/per_speaker_wer -maxdepth 1 -name "wer_*" -delete + + # Compute the average WER stats for all conditions individually. + wer_dir=$eval_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/per_speaker_wer + for cond in $conditions; do + grep $cond $wer_dir/best_wer_all | awk -v COND="$cond" ' + { + ERR+=$5; WC+=$7; INS+=$8; DEL+=$10; SUB+=$12; + }END{ + WER=ERR*100/WC; + printf("%s %%WER %.2f [ %d / %d, %d ins, %d del, %d sub ]\n",COND,WER,ERR,WC,INS,DEL,SUB); + } + ' + done > $eval_decodedir/scoring_kaldi_multispeaker/best_wer + + # Compute overall WER average + cat $wer_dir/best_wer_all | awk ' + { + ERR+=$5; WC+=$7; INS+=$8; DEL+=$10; SUB+=$12; + }END{ + WER=ERR*100/WC; + printf("%%WER %.2f [ %d / %d, %d ins, %d del, %d sub ]",WER,ERR,WC,INS,DEL,SUB); + } + ' > $eval_decodedir/scoring_kaldi_multispeaker/best_wer_average +fi + +# printing dev and eval wer +echo "Dev WERs:" +cat $dev_decodedir/scoring_kaldi_multispeaker/best_wer +echo "Eval WERs:" +cat $eval_decodedir/scoring_kaldi_multispeaker/best_wer + diff --git a/egs/libri_css/s5_mono/local/score_reco_oracle.sh b/egs/libri_css/s5_mono/local/score_reco_oracle.sh new file mode 100755 index 00000000000..e3dc1369a46 --- /dev/null +++ b/egs/libri_css/s5_mono/local/score_reco_oracle.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Copyright 2019 Johns Hopkins University (Author: Shinji Watanabe) +# Apache 2.0 +# +# This script scores the multi-speaker LibriCSS recordings. +# It first calculates the best search parameter configurations by using the dev set +# and then uses these to score both sets. + +cmd=run.pl +dev=exp/chain_cleaned/tdnn_1d_sp/decode_dev +eval=exp/chain_cleaned/tdnn_1d_sp/decode_eval + +conditions="0L 0S OV10 OV20 OV30 OV40" + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 0 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)]" + echo "This script scores the LibriCSS full recordings" + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 
+ echo " --dev # dev set decoding directory" + echo " --eval # eval set decoding directory" + exit 1; +fi + +# get language model weight and word insertion penalty from the dev set +best_lmwt=`cat $dev/scoring_kaldi/wer_details/lmwt` +best_wip=`cat $dev/scoring_kaldi/wer_details/wip` + +echo "best LM weight: $best_lmwt" +echo "insertion penalty weight: $best_wip" + +echo "==== development set ====" +# development set +# we report scores by overlap type, i.e., 0L, 0S, OV10, and so on. + +# get the scores per utterance +score_result=$dev/scoring_kaldi/wer_details/per_utt + +for cond in $conditions; do + # get nerror + nerr=`grep "\#csid" $score_result | grep $cond | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $cond | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + # report the results + echo -n "Condition $cond: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" +done + +echo -n "overall: " +# get nerror +nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` +# get nwords from references (NF-2 means to exclude utterance id and " ref ") +nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` +# compute wer with scale=2 +wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` +echo -n "#words $nwrd, " +echo -n "#errors $nerr, " +echo "wer $wer %" + +echo "==== evaluation set ====" +# evaluation set +# get the scoring result per utterance. Copied from local/score.sh +mkdir -p $eval/scoring_kaldi/wer_details_devbest +$cmd $eval/scoring_kaldi/log/stats1.log \ + cat $eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$eval/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \> $eval/scoring_kaldi/wer_details_devbest/per_utt + +score_result=$eval/scoring_kaldi/wer_details_devbest/per_utt + +for cond in $conditions; do + # get nerror + nerr=`grep "\#csid" $score_result | grep $cond | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $cond | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "Condition $cond: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" +done + +echo -n "overall: " +# get nerror +nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` +# get nwords from references (NF-2 means to exclude utterance id and " ref ") +nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` +# compute wer with scale=2 +wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` +echo -n "overall: " +echo -n "#words $nwrd, " +echo -n "#errors $nerr, " +echo "wer $wer %" + + diff --git a/egs/libri_css/s5_mono/local/segmentation/apply_webrtcvad.py b/egs/libri_css/s5_mono/local/segmentation/apply_webrtcvad.py new file mode 100755 index 00000000000..2fec7e575e6 --- /dev/null +++ b/egs/libri_css/s5_mono/local/segmentation/apply_webrtcvad.py @@ -0,0 +1,212 @@ +#!/usr/bin/python3 +# +# This script gets speech segments from whole recordings using webrtcvad +# Modified from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py +# +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# 
+# Apache 2.0
+
+import collections, sys, os, argparse, contextlib
+import wave
+import webrtcvad
+
+def get_args():
+    parser = argparse.ArgumentParser(description="Obtain speech segments for all wav files in a dir."
+                        " Writes the output to stdout."
+                        " Usage: apply_webrtcvad.py [options...] <data-dir>"
+                        " E.g.: apply_webrtcvad.py --mode 2 --reco2channels data/reco2channels data",
+                        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument("--mode", type=int, dest="mode", default=1,
+                        help="Integer in {0,1,2,3} specifying the VAD aggressiveness. 0 is the least aggressive"
+                        " about filtering out non-speech, 3 is the most aggressive.")
+
+    parser.add_argument("--reco2channels", type=str, dest="reco2ch_file",
+                        help="In a multi-channel setting, specifying this avoids computing VAD for each channel"
+                        " separately. Only the first channel is used to compute VAD and all channels share it.")
+
+    parser.add_argument("data_dir", help="Data directory containing wav.scp")
+
+    args = parser.parse_args()
+
+    return args
+
+def check_args(args):
+    if args.mode not in [0, 1, 2, 3]:
+        raise Exception("Aggressiveness mode must be in {0,1,2,3}")
+    if not os.path.exists(os.path.join(args.data_dir, 'wav.scp')):
+        raise Exception("No wav.scp file exists")
+    return
+
+def read_wave(path):
+    """Reads a .wav file.
+    Takes the path, and returns (PCM audio data, sample rate).
+    """
+    with contextlib.closing(wave.open(path, 'rb')) as wf:
+        num_channels = wf.getnchannels()
+        assert num_channels == 1
+        sample_width = wf.getsampwidth()
+        assert sample_width == 2
+        sample_rate = wf.getframerate()
+        assert sample_rate in (8000, 16000, 32000, 48000)
+        pcm_data = wf.readframes(wf.getnframes())
+        return pcm_data, sample_rate
+
+
+class Frame(object):
+    """Represents a "frame" of audio data."""
+    def __init__(self, bytes, timestamp, duration):
+        self.bytes = bytes
+        self.timestamp = timestamp
+        self.duration = duration
+
+
+def frame_generator(frame_duration_ms, audio, sample_rate):
+    """Generates audio frames from PCM audio data.
+    Takes the desired frame duration in milliseconds, the PCM data, and
+    the sample rate.
+    Yields Frames of the requested duration.
+    """
+    # 2 bytes per 16-bit sample
+    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
+    offset = 0
+    timestamp = 0.0
+    duration = (float(n) / sample_rate) / 2.0
+    while offset + n < len(audio):
+        yield Frame(audio[offset:offset + n], timestamp, duration)
+        timestamp += duration
+        offset += n
+
+
+def vad_segments(sample_rate, frame_duration_ms,
+                 padding_duration_ms, vad, frames):
+    """Filters out non-voiced audio frames.
+    Given a webrtcvad.Vad and a source of audio frames, keeps only
+    the voiced audio.
+    Uses a padded, sliding window algorithm over the audio frames.
+    When more than 90% of the frames in the window are voiced (as
+    reported by the VAD), the collector triggers and begins collecting
+    audio frames. Then the collector waits until 90% of the frames in
+    the window are unvoiced to detrigger.
+    The window is padded at the front and back to provide a small
+    amount of silence or the beginnings/endings of speech around the
+    voiced frames.
+    Arguments:
+    sample_rate - The audio sample rate, in Hz.
+    frame_duration_ms - The frame duration in milliseconds.
+    padding_duration_ms - The amount to pad the window, in milliseconds.
+    vad - An instance of webrtcvad.Vad.
+    frames - a source of audio frames (sequence or generator).
+    Returns: List of (start_time,end_time) tuples.
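+    Example (a sketch; 30 ms frames and 300 ms padding, the values used below):
+        vad = webrtcvad.Vad(1)
+        frames = list(frame_generator(30, audio, 16000))
+        segments = vad_segments(16000, 30, 300, vad, frames)
+        # -> e.g. [(0.81, 5.43), (6.27, 9.9)]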
+ """ + num_padding_frames = int(padding_duration_ms / frame_duration_ms) + # We use a deque for our sliding window/ring buffer. + ring_buffer = collections.deque(maxlen=num_padding_frames) + # We have two states: TRIGGERED and NOTTRIGGERED. We start in the + # NOTTRIGGERED state. + triggered = False + segments = [] + voiced_frames = [] + for frame in frames: + is_speech = vad.is_speech(frame.bytes, sample_rate) + + if not triggered: + ring_buffer.append((frame, is_speech)) + num_voiced = len([f for f, speech in ring_buffer if speech]) + # If we're NOTTRIGGERED and more than 90% of the frames in + # the ring buffer are voiced frames, then enter the + # TRIGGERED state. + if num_voiced > 0.9 * ring_buffer.maxlen: + triggered = True + for f, s in ring_buffer: + voiced_frames.append(f) + start_time = voiced_frames[0].timestamp + ring_buffer.clear() + else: + # We're in the TRIGGERED state, so collect the audio data + # and add it to the ring buffer. + voiced_frames.append(frame) + ring_buffer.append((frame, is_speech)) + num_unvoiced = len([f for f, speech in ring_buffer if not speech]) + # If more than 90% of the frames in the ring buffer are + # unvoiced, then enter NOTTRIGGERED and yield whatever + # audio we've collected. + if num_unvoiced > 0.9 * ring_buffer.maxlen: + end_time = frame.timestamp + frame.duration + triggered = False + ring_buffer.clear() + voiced_frames = [] + # Write to segments list + segments.append((start_time, end_time)) + # If we have any leftover voiced audio when we run out of input, + # add it to segments list. + if voiced_frames: + end_time = voiced_frames[-1].timestamp + segments.append((start_time, end_time)) + return segments + + +def get_reco2channels(reco2ch_file): + """ + Given a file containing reco id and channel ids for the recording, return + the corresponding dictionary. + """ + reco2channels = {} + with open(reco2ch_file, 'r') as f: + for line in f.readlines(): + reco, channels = line.strip.split(maxsplit=1) + channels = channels.split() + reco2channels[reco] = channels + return reco2channels + +def get_wav_list(data_dir, reco2channels=None): + """ + Return a dictionary of uttid with wav paths. Optionally takes reco2channels and, + if provided, the uttid is actually the recoid. + """ + if reco2channels is not None: + keep_wavs = {reco2channels[reco][0]:reco for reco in reco2channels.keys()} + wav_list = {} + with open(os.path.join(data_dir,'wav.scp'),'r') as f: + for line in f.readlines(): + utt, wav = line.strip().split() + if reco2channels is not None: + if utt in keep_wavs: + wav_list[keep_wavs[utt]] = wav + else: + wav_list[utt] = wav + return wav_list + +def get_speech_segments(uttid, wav, vad): + """ + Compute and print the segments for the given uttid. 
It is in the format:
+    <segment-id> <reco-id> <start-time> <end-time>
+    """
+    audio, sample_rate = read_wave(wav)
+    frames = frame_generator(30, audio, sample_rate)
+    frames = list(frames)
+    segments = vad_segments(sample_rate, 30, 300, vad, frames)
+    for segment in segments:
+        start = float("{:.2f}".format(segment[0]))
+        end = float("{:.2f}".format(segment[1]))
+        segment_id = '{}_{}_{}'.format(uttid, '{:.0f}'.format(100*start).zfill(6), '{:.0f}'.format(100*end).zfill(6))
+        print("{} {} {} {}".format(segment_id, uttid, start, end))
+    return
+
+def main():
+    # First we read and check the arguments
+    args = get_args()
+    check_args(args)
+
+    if args.reco2ch_file is not None:
+        reco2channels = get_reco2channels(args.reco2ch_file)
+        wav_list = get_wav_list(args.data_dir, reco2channels)
+    else:
+        wav_list = get_wav_list(args.data_dir)
+
+    vad = webrtcvad.Vad(args.mode)
+    for utt in wav_list.keys():
+        get_speech_segments(utt, wav_list[utt], vad)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/egs/libri_css/s5_mono/local/segmentation/detect_speech_activity.sh b/egs/libri_css/s5_mono/local/segmentation/detect_speech_activity.sh
new file mode 100755
index 00000000000..c9719d472f3
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/segmentation/detect_speech_activity.sh
@@ -0,0 +1,217 @@
+#!/usr/bin/env bash
+
+# Copyright 2016-17  Vimal Manohar
+#              2017  Nagendra Kumar Goel
+# Apache 2.0.
+
+# This script does nnet3-based speech activity detection given an input
+# kaldi data directory and outputs a segmented kaldi data directory.
+# This script can also do music detection and other similar segmentation
+# using appropriate options such as --output-name output-music.
+
+set -e
+set -o pipefail
+set -u
+
+if [ -f ./path.sh ]; then . ./path.sh; fi
+
+affix=   # Affix for the segmentation
+nj=32
+cmd=queue.pl
+stage=-1
+
+# Feature options (Must match training)
+mfcc_config=conf/mfcc_hires.conf
+feat_affix=   # Affix for the type of feature used
+
+output_name=output   # The output node in the network
+sad_name=sad   # Base name for the directory storing the computed loglikes
+               # Can be music for music detection
+segmentation_name=segmentation   # Base name for the directory doing segmentation
+                                 # Can be segmentation_music for music detection
+
+# SAD network config
+iter=final   # Model iteration to use
+
+# Contexts must ideally match training for LSTM models, but
+# may not necessarily for stats components
+extra_left_context=0   # Set to some large value, typically 40 for LSTM (must match training)
+extra_right_context=0
+extra_left_context_initial=-1
+extra_right_context_final=-1
+frames_per_chunk=150
+
+# Decoding options
+graph_opts="--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0"
+acwt=0.3
+
+# These <class1>_in_<class2>_weight represent the fraction of probability
+# to transfer to class <class2>.
+# e.g. --speech-in-sil-weight=0.0 --garbage-in-sil-weight=0.0 --sil-in-speech-weight=0.0 --garbage-in-speech-weight=0.3
+transform_probs_opts=""
+
+# Postprocessing options
+segment_padding=0.2   # Duration (in seconds) of padding added to segments
+min_segment_dur=0   # Minimum duration (in seconds) required for a segment to be included
+                    # This is before any padding. Segments shorter than this duration will be removed.
+                    # This is an alternative to --min-speech-duration above.
+merge_consecutive_max_dur=0   # Merge consecutive segments as long as the merged segment is no longer than this many
+                              # seconds. The segments are only merged if their boundaries are touching.
+                              # This is after padding by --segment-padding seconds.
+                              # 0 means do not merge. Use 'inf' to not limit the duration.
+
+echo $*
+
+. utils/parse_options.sh
+
+if [ $# -ne 5 ]; then
+  echo "This script does nnet3-based speech activity detection given an input kaldi "
+  echo "data directory and outputs an output kaldi data directory."
+  echo "See script for details of the options to be supplied."
+  echo "Usage: $0 <src-data-dir> <sad-nnet-dir> <mfcc-dir> <work-dir> <out-data-dir>"
+  echo " e.g.: $0 ~/workspace/egs/ami/s5b/data/sdm1/dev exp/nnet3_sad_snr/nnet_tdnn_j_n4 \\"
+  echo "    mfcc_hires exp/segmentation_sad_snr/nnet_tdnn_j_n4 data/ami_sdm1_dev"
+  echo ""
+  echo "Options: "
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>)  # how to run jobs."
+  echo "  --nj <num-jobs>                                   # number of parallel jobs to run."
+  echo "  --stage <stage>                                   # stage to do partial re-run from."
+  echo "  --convert-data-dir-to-whole <true|false>          # If true, the input data directory is "
+  echo "                                                    # first converted to whole data directory (i.e. whole recordings) "
+  echo "                                                    # and segmentation is done on that."
+  echo "                                                    # If false, then the original segments are "
+  echo "                                                    # retained and they are split into sub-segments."
+  echo "  --output-name <name>                              # The output node in the network"
+  echo "  --extra-left-context <int>                        # Set to some large value, typically 40 for LSTM (must match training)"
+  echo "  --extra-right-context <int>                       # For BLSTM or statistics pooling"
+  exit 1
+fi
+
+src_data_dir=$1   # The input data directory that needs to be segmented.
+                  # If convert_data_dir_to_whole is true, any segments in that will be ignored.
+sad_nnet_dir=$2   # The SAD neural network
+mfcc_dir=$3       # The directory to store the features
+dir=$4            # Work directory
+data_dir=$5       # The output data directory will be ${data_dir}_seg
+
+affix=${affix:+_$affix}
+feat_affix=${feat_affix:+_$feat_affix}
+
+data_id=`basename $data_dir`
+sad_dir=${dir}/${sad_name}${affix}_${data_id}${feat_affix}
+seg_dir=${dir}/${segmentation_name}${affix}_${data_id}${feat_affix}
+test_data_dir=data/${data_id}${feat_affix}
+
+###############################################################################
+## Forward pass through the network and dump the log-likelihoods.
+###############################################################################
+
+frame_subsampling_factor=1
+if [ -f $sad_nnet_dir/frame_subsampling_factor ]; then
+  frame_subsampling_factor=$(cat $sad_nnet_dir/frame_subsampling_factor)
+fi
+
+mkdir -p $dir
+if [ $stage -le 1 ]; then
+  if [ "$(readlink -f $sad_nnet_dir)" != "$(readlink -f $dir)" ]; then
+    cp $sad_nnet_dir/cmvn_opts $dir || exit 1
+  fi
+
+  ########################################################################
+  ## Initialize neural network for decoding using the output $output_name
+  ########################################################################
+
+  if [ ! -z "$output_name" ] && [ "$output_name" != output ]; then
+    $cmd $dir/log/get_nnet_${output_name}.log \
+      nnet3-copy --edits="rename-node old-name=$output_name new-name=output" \
+      $sad_nnet_dir/$iter.raw $dir/${iter}_${output_name}.raw || exit 1
+    iter=${iter}_${output_name}
+  else
+    if ! diff $sad_nnet_dir/$iter.raw $dir/$iter.raw; then
+      cp $sad_nnet_dir/$iter.raw $dir/
+    fi
+  fi
+
+  steps/nnet3/compute_output.sh --nj $nj --cmd "$cmd" \
+    --iter ${iter} \
+    --extra-left-context $extra_left_context \
+    --extra-right-context $extra_right_context \
+    --extra-left-context-initial $extra_left_context_initial \
+    --extra-right-context-final $extra_right_context_final \
+    --frames-per-chunk $frames_per_chunk --apply-exp true \
+    --frame-subsampling-factor $frame_subsampling_factor \
+    ${test_data_dir} $dir $sad_dir || exit 1
+fi
+
+###############################################################################
+## Prepare FST we search to make speech/silence decisions.
+###############################################################################
+
+utils/data/get_utt2dur.sh --nj $nj --cmd "$cmd" $test_data_dir || exit 1
+frame_shift=$(utils/data/get_frame_shift.sh $test_data_dir) || exit 1
+
+graph_dir=${dir}/graph_${output_name}
+if [ $stage -le 2 ]; then
+  mkdir -p $graph_dir
+
+  # 1 for silence and 2 for speech
+  cat <<EOF > $graph_dir/words.txt
+<eps> 0
+silence 1
+speech 2
+EOF
+
+  $cmd $graph_dir/log/make_graph.log \
+    steps/segmentation/internal/prepare_sad_graph.py $graph_opts \
+    --frame-shift=$(perl -e "print $frame_shift * $frame_subsampling_factor") - \| \
+    fstcompile --isymbols=$graph_dir/words.txt --osymbols=$graph_dir/words.txt '>' \
+    $graph_dir/HCLG.fst
+fi
+
+###############################################################################
+## Do Viterbi decoding to create per-frame alignments.
+###############################################################################
+
+post_vec=$sad_nnet_dir/post_${output_name}.vec
+if [ ! -f $sad_nnet_dir/post_${output_name}.vec ]; then
+  if [ ! -f $sad_nnet_dir/post_${output_name}.txt ]; then
+    echo "$0: Could not find $sad_nnet_dir/post_${output_name}.vec. "
+    echo "Re-run the corresponding stage in the training script possibly "
+    echo "with --compute-average-posteriors=true or compute the priors "
+    echo "from the training labels"
+    exit 1
+  else
+    post_vec=$sad_nnet_dir/post_${output_name}.txt
+  fi
+fi
+
+mkdir -p $seg_dir
+if [ $stage -le 3 ]; then
+  steps/segmentation/internal/get_transform_probs_mat.py \
+    --priors="$post_vec" $transform_probs_opts > $seg_dir/transform_probs.mat
+
+  steps/segmentation/decode_sad.sh --acwt $acwt --cmd "$cmd" \
+    --nj $nj \
+    --transform "$seg_dir/transform_probs.mat" \
+    $graph_dir $sad_dir $seg_dir
+fi
+
+###############################################################################
+## Post-process segmentation to create kaldi data directory.
+###############################################################################
+
+if [ $stage -le 4 ]; then
+  steps/segmentation/post_process_sad_to_segments.sh \
+    --segment-padding $segment_padding --min-segment-dur $min_segment_dur \
+    --merge-consecutive-max-dur $merge_consecutive_max_dur \
+    --cmd "$cmd" --frame-shift $(perl -e "print $frame_subsampling_factor * $frame_shift") \
+    ${test_data_dir} ${seg_dir} ${seg_dir}
+fi
+
+if [ $stage -le 5 ]; then
+  utils/data/subsegment_data_dir.sh ${test_data_dir} ${seg_dir}/segments \
+    ${data_dir}_seg
+fi
+
+echo "$0: Created output segmented kaldi data directory in ${data_dir}_seg"
+exit 0
diff --git a/egs/libri_css/s5_mono/local/train_asr.sh b/egs/libri_css/s5_mono/local/train_asr.sh
new file mode 100755
index 00000000000..99043607f2a
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/train_asr.sh
@@ -0,0 +1,205 @@
+# This script is called from run.sh. It downloads the Librispeech
+# data and trains an ASR model on it.
+
+nj=50
+stage=0
+
+. ./utils/parse_options.sh
+
+echo >&2 "$0" "$@"
+if [ $# -ne 1 ] ; then
+  echo >&2 "$0" "$@"
+  echo >&2 "$0: Error: wrong number of arguments"
+  echo -e >&2 "Usage:\n  $0 [opts] <corpus-dir>"
+  echo -e >&2 "eg:\n  $0 /export/corpora/Librispeech"
+  exit 1
+fi
+data=$1
+
+train_sets="train_clean_100 train_clean_360 train_other_500"
+
+. ./cmd.sh
+. ./path.sh
+
+set -e # exit on error
+
+# base url for downloads.
+data_url=www.openslr.org/resources/12
+lm_url=www.openslr.org/resources/11
+mfccdir=mfcc
+
+
+if [ $stage -le 1 ]; then
+  # prepare the data (uncomment download_and_untar below if you still need to download it).
+  for part in train-clean-100 train-clean-360 train-other-500; do
+    # local/download_and_untar.sh $data $data_url $part
+    local/data_prep_librispeech.sh $data/$part \
+      data/$(echo $part | sed s/-/_/g)
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  # spread the mfccs over various machines, as this data-set is quite large.
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
+    mfcc=$(basename $mfccdir) # in case it was an absolute pathname (unlikely), get the basename.
+    utils/create_split_dir.pl /export/b{02,11,12,13}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \
+      $mfccdir/storage
+  fi
+fi
+
+
+if [ $stage -le 3 ]; then
+  for part in $train_sets; do
+    steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj data/$part exp/make_mfcc/$part $mfccdir
+    steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  # Make some small data subsets for early system-build stages. Note, there are 29k
+  # utterances in the train_clean_100 directory which has 100 hours of data.
+  # For the monophone stages we select the shortest utterances, which should make it
+  # easier to align the data from a flat start.
+
+  utils/subset_data_dir.sh --shortest data/train_clean_100 2000 data/train_2kshort
+  utils/subset_data_dir.sh data/train_clean_100 5000 data/train_5k
+  utils/subset_data_dir.sh data/train_clean_100 10000 data/train_10k
+
+  # We also combine the clean data which will be used to train the larger SAT model
+  utils/combine_data.sh \
+    data/train_clean_460 data/train_clean_100 data/train_clean_360
+
+  # And combine all 960h data, which will be used to train the nnet
+  utils/combine_data.sh \
+    data/train_960 data/train_clean_460 data/train_other_500
+fi
+
+if [ $stage -le 5 ]; then
+  # download the LM resources
+  local/download_lm.sh $lm_url data/local/lm
+fi
+
+if [ $stage -le 6 ]; then
+  # when the "--stage 3" option is used below we skip the G2P steps, and use the
+  # lexicon we have already downloaded from openslr.org/11/
+  local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \
+    data/local/lm data/local/lm data/local/dict_nosp
+
+  utils/prepare_lang.sh data/local/dict_nosp \
+    "<UNK>" data/local/lang_tmp_nosp data/lang_nosp
+
+  local/format_lms.sh --src-dir data/lang_nosp data/local/lm
+fi
+
+if [ $stage -le 7 ]; then
+  # Create ConstArpaLm format language model for full 3-gram and 4-gram LMs
+  utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz \
+    data/lang_nosp data/lang_nosp_test_tglarge
+  utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz \
+    data/lang_nosp data/lang_nosp_test_fglarge
+fi
+
+if [ $stage -le 8 ]; then
+  # train a monophone system
+  steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \
+    data/train_2kshort data/lang_nosp exp/mono
+fi
+
+if [ $stage -le 9 ]; then
+  steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
+    data/train_5k data/lang_nosp exp/mono exp/mono_ali_5k
+
+  # train a first delta + delta-delta triphone system on a subset of 5000 utterances
+  steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
+    2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1
+fi
+
+if [ $stage -le 10 ]; then
+  steps/align_si.sh --nj 10 --cmd "$train_cmd" \
+    data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k
+
+
+  # train an LDA+MLLT system.
+  steps/train_lda_mllt.sh --cmd "$train_cmd" \
+    --splice-opts "--left-context=3 --right-context=3" 2500 15000 \
+    data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b
+fi
+
+if [ $stage -le 11 ]; then
+  # Align a 10k utts subset using the tri2b model
+  steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \
+    data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k
+
+  # Train tri3b, which is LDA+MLLT+SAT on 10k utts
+  steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \
+    data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b
+fi
+
+if [ $stage -le 12 ]; then
+  # align the entire train_clean_100 subset using the tri3b model
+  steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
+    data/train_clean_100 data/lang_nosp \
+    exp/tri3b exp/tri3b_ali_clean_100
+
+  # train another LDA+MLLT+SAT system on the entire 100 hour subset
+  steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
+    data/train_clean_100 data/lang_nosp \
+    exp/tri3b_ali_clean_100 exp/tri4b
+fi
+
+if [ $stage -le 13 ]; then
+  # Now we compute the pronunciation and silence probabilities from training data,
+  # and re-create the lang directory.
+  steps/get_prons.sh --cmd "$train_cmd" \
+    data/train_clean_100 data/lang_nosp exp/tri4b
+  utils/dict_dir_add_pronprobs.sh --max-normalize true \
+    data/local/dict_nosp \
+    exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \
+    exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict
+
+  utils/prepare_lang.sh data/local/dict \
+    "<UNK>" data/local/lang_tmp data/lang
+  local/format_lms.sh --src-dir data/lang data/local/lm
+
+  utils/build_const_arpa_lm.sh \
+    data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge
+  utils/build_const_arpa_lm.sh \
+    data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge
+fi
+
+if [ $stage -le 14 ]; then
+  # align the 460h clean set using the tri4b model
+  steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
+    data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460
+
+  # create a larger SAT model, trained on the 460 hours of data.
+  steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \
+    data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b
+fi
+
+if [ $stage -le 15 ]; then
+  steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
+    data/train_960 data/lang exp/tri5b exp/tri5b_ali_960
+
+  # train a SAT model on the 960 hour mixed data. Use the train_quick.sh script
+  # as it is faster.
+  steps/train_quick.sh --cmd "$train_cmd" \
+    7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b
+fi
+
+if [ $stage -le 16 ]; then
+  # this does some data-cleaning. The cleaned data should be useful when we add
+  # the neural net and chain systems. (although actually it was pretty clean already.)
+  local/run_cleanup_segmentation.sh
+fi
+
+if [ $stage -le 17 ]; then
+  # train and test nnet3 tdnn models on the entire data with data-cleaning.
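+  # (If you copied the pretrained chain model into exp/ instead, as described
+  # in the comments in run.sh, this stage can be skipped.)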
+  local/chain/run_tdnn.sh
+
+  # Fine tune with reverberated Librispeech data
+  local/chain/tuning/run_tdnn_1d_ft.sh
+fi
+
+
+exit 0
\ No newline at end of file
diff --git a/egs/libri_css/s5_mono/local/train_diarizer.sh b/egs/libri_css/s5_mono/local/train_diarizer.sh
new file mode 100755
index 00000000000..6fc7156cf8b
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/train_diarizer.sh
@@ -0,0 +1,186 @@
+#!/usr/bin/env bash
+# Copyright
+#      2019  David Snyder
+# Apache 2.0.
+#
+# This script is based on the run.sh script in the Voxceleb v2 recipe.
+# It trains an x-vector DNN for diarization.
+
+mfccdir=`pwd`/mfcc
+vaddir=`pwd`/mfcc
+
+voxceleb1_root=/export/corpora/VoxCeleb1
+voxceleb2_root=/export/corpora/VoxCeleb2
+data_dir=train_other_500
+model_dir=exp/xvector_nnet_1a
+
+stage=0
+train_stage=-1
+
+. ./cmd.sh
+
+if [ -f ./path.sh ]; then . ./path.sh; fi
+set -e -u -o pipefail
+. utils/parse_options.sh
+
+if [ $# -ne 0 ]; then
+  exit 1
+fi
+
+if [ $stage -le 0 ]; then
+  echo "$0: preparing voxceleb 2 data"
+  local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
+  local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
+
+  echo "$0: preparing voxceleb 1 data (see comments if this step fails)"
+  # The format of the voxceleb 1 corpus has changed several times since it was
+  # released. Therefore, our dataprep scripts may or may not fail depending
+  # on the version of the corpus you obtained.
+  # If you downloaded the corpus soon after it was first released, this
+  # version of the dataprep script might work:
+  local/make_voxceleb1.pl $voxceleb1_root data/voxceleb1
+  # However, if you've downloaded the corpus recently, you may need to use
+  # the following scripts instead:
+  #local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
+  #local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test
+
+  # We should now have about 7,351 speakers and 1,277,503 utterances.
+  utils/combine_data.sh data/voxceleb data/voxceleb2_train data/voxceleb2_test
+fi
+
+if [ $stage -le 1 ]; then
+  echo "$0: preparing features for training data (voxceleb 1 + 2)"
+  steps/make_mfcc.sh --write-utt2num-frames true \
+    --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \
+    data/voxceleb exp/make_mfcc $mfccdir
+  utils/fix_data_dir.sh data/voxceleb
+  # Note that we apply CMN to the MFCCs and write these to the disk. These
+  # features will later be used to train the x-vector DNN.
+fi
+
+# In this section, we augment the voxceleb data with reverberation.
+# Note that we can probably improve the x-vector DNN if we include
+# augmentations from the nonspeech regions of the Chime 6 training
+# dataset.
+if [ $stage -le 2 ]; then
+  echo "$0: applying augmentation to x-vector training data (just reverb for now)"
+  frame_shift=0.01
+  awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/voxceleb/utt2num_frames > data/voxceleb/reco2dur
+
+  if [ ! -d "RIRS_NOISES" ]; then
+    echo "$0: downloading simulated room impulse response dataset"
+    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
+    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
+    unzip rirs_noises.zip
+  fi
+
+  # Make a version with reverberated speech
+  rvb_opts=()
+  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
+  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
+
+  # Make a reverberated version of the training data. Note that we don't add any
+  # additive noise here.
+  steps/data/reverberate_data_dir.py \
+    "${rvb_opts[@]}" \
+    --speech-rvb-probability 1 \
+    --pointsource-noise-addition-probability 0 \
+    --isotropic-noise-addition-probability 0 \
+    --num-replications 1 \
+    --source-sampling-rate 16000 \
+    data/voxceleb data/voxceleb_reverb
+  utils/copy_data_dir.sh --utt-suffix "-reverb" data/voxceleb_reverb data/voxceleb_reverb.new
+  rm -rf data/voxceleb_reverb
+  mv data/voxceleb_reverb.new data/voxceleb_reverb
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: making MFCCs for augmented training data"
+  # Make MFCCs for the augmented data. Note that we do not compute a new
+  # vad.scp file here. Instead, we use the vad.scp from the clean version of
+  # the list.
+  steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \
+    data/voxceleb_reverb exp/make_mfcc $mfccdir
+  # Combine the clean and augmented training data. This is now roughly
+  # double the size of the original clean list.
+  utils/combine_data.sh data/voxceleb_combined data/voxceleb_reverb data/voxceleb
+fi
+
+# Now we prepare the features to generate examples for xvector training.
+if [ $stage -le 4 ]; then
+  # This script applies CMVN and removes nonspeech frames. Note that this is somewhat
+  # wasteful, as it roughly doubles the amount of training data on disk. After
+  # creating voxceleb examples, this can be removed.
+  echo "$0: preparing features to train x-vector DNN"
+  local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \
+    data/voxceleb_combined data/voxceleb_combined_cmn exp/voxceleb_combined_cmn
+  utils/fix_data_dir.sh data/voxceleb_combined_cmn
+fi
+
+if [ $stage -le 5 ]; then
+  # Now, we need to remove features that are too short after removing silence
+  # frames. We want at least 4s (400 frames) per utterance.
+  min_len=400
+  mv data/voxceleb_combined_cmn/utt2num_frames data/voxceleb_combined_cmn/utt2num_frames.bak
+  awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/voxceleb_combined_cmn/utt2num_frames.bak > data/voxceleb_combined_cmn/utt2num_frames
+  utils/filter_scp.pl data/voxceleb_combined_cmn/utt2num_frames data/voxceleb_combined_cmn/utt2spk > data/voxceleb_combined_cmn/utt2spk.new
+  mv data/voxceleb_combined_cmn/utt2spk.new data/voxceleb_combined_cmn/utt2spk
+  utils/fix_data_dir.sh data/voxceleb_combined_cmn
+
+  # We also want several utterances per speaker. Now we'll throw out speakers
+  # with fewer than 8 utterances.
+  min_num_utts=8
+  awk '{print $1, NF-1}' data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/spk2num
+  awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' data/voxceleb_combined_cmn/spk2num | utils/filter_scp.pl - data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/spk2utt.new
+  mv data/voxceleb_combined_cmn/spk2utt.new data/voxceleb_combined_cmn/spk2utt
+  utils/spk2utt_to_utt2spk.pl data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/utt2spk
+
+  utils/filter_scp.pl data/voxceleb_combined_cmn/utt2spk data/voxceleb_combined_cmn/utt2num_frames > data/voxceleb_combined_cmn/utt2num_frames.new
+  mv data/voxceleb_combined_cmn/utt2num_frames.new data/voxceleb_combined_cmn/utt2num_frames
+
+  utils/fix_data_dir.sh data/voxceleb_combined_cmn
+fi
+
+# Stages 6 through 8 are handled in run_xvector.sh.
+# This script trains the x-vector DNN on the augmented voxceleb data.
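+# (It is invoked unconditionally: since it receives --stage $stage, its
+# internal stage checks should make it a no-op once $stage is past 8.)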
+local/nnet3/xvector/run_xvector.sh --stage $stage --train-stage $train_stage \
+  --data data/voxceleb_combined_cmn --nnet-dir $model_dir \
+  --egs-dir $model_dir/egs
+
+if [ $stage -le 9 ]; then
+  echo "$0: preparing a subset of Librispeech data to train PLDA model"
+  utils/subset_data_dir.sh ${data_dir} 100000 data/plda_train
+  steps/make_mfcc.sh --write-utt2num-frames true \
+    --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \
+    data/plda_train exp/make_mfcc $mfccdir
+  utils/fix_data_dir.sh data/plda_train
+  local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \
+    data/plda_train data/plda_train_cmn exp/plda_train_cmn
+  if [ -f data/plda_train/segments ]; then
+    cp data/plda_train/segments data/plda_train_cmn/
+  fi
+fi
+
+if [ $stage -le 10 ]; then
+  echo "$0: extracting x-vector for PLDA training data"
+  utils/fix_data_dir.sh data/plda_train_cmn
+  diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 10G" \
+    --nj 40 --window 3.0 --period 10.0 --min-segment 1.5 --apply-cmn false \
+    --hard-min true $model_dir \
+    data/plda_train_cmn $model_dir/xvectors_plda_train
+fi
+
+# Train PLDA models
+if [ $stage -le 11 ]; then
+  echo "$0: training PLDA model"
+  $train_cmd $model_dir/xvectors_plda_train/log/plda.log \
+    ivector-compute-plda ark:$model_dir/xvectors_plda_train/spk2utt \
+    "ark:ivector-subtract-global-mean \
+    scp:$model_dir/xvectors_plda_train/xvector.scp ark:- \
+    | transform-vec $model_dir/xvectors_plda_train/transform.mat ark:- ark:- \
+    | ivector-normalize-length ark:- ark:- |" \
+    $model_dir/xvectors_plda_train/plda || exit 1;
+  cp $model_dir/xvectors_plda_train/plda $model_dir/
+  cp $model_dir/xvectors_plda_train/transform.mat $model_dir/
+  cp $model_dir/xvectors_plda_train/mean.vec $model_dir/
+fi
diff --git a/egs/libri_css/s5_mono/local/wer_output_filter b/egs/libri_css/s5_mono/local/wer_output_filter
new file mode 100755
index 00000000000..6f4b6400716
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/wer_output_filter
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal <jtrmal@gmail.com>)
+# Apache 2.0
+
+
+## Filter for scoring of the STT results. Convert everything to lowercase
+## and add some ad-hoc fixes for the hesitations
+
+perl -e '
+  while(<STDIN>) {
+    @A = split(" ", $_);
+    $id = shift @A; print "$id ";
+    foreach $a (@A) {
+      print lc($a) . " " unless $a =~ /\[.*\]/;
+    }
+    print "\n";
+  }' | \
+sed -e '
+  s/\<mm\>/hmm/g;
+  s/\<mhm\>/hmm/g;
+  s/\<mmhm\>/hmm/g;
+'
+
+#| uconv -f utf-8 -t utf-8 -x Latin-ASCII
+
diff --git a/egs/libri_css/s5_mono/path.sh b/egs/libri_css/s5_mono/path.sh
new file mode 100644
index 00000000000..ab1a81a86ef
--- /dev/null
+++ b/egs/libri_css/s5_mono/path.sh
@@ -0,0 +1,10 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
+export PATH=$PWD/dscore:$PATH
+export PYTHONPATH="${PYTHONPATH}:$PWD/dscore"
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
+export BASH_ENV="~/.aliases"
+
diff --git a/egs/libri_css/s5_mono/rnnlm b/egs/libri_css/s5_mono/rnnlm
new file mode 120000
index 00000000000..e136939ba72
--- /dev/null
+++ b/egs/libri_css/s5_mono/rnnlm
@@ -0,0 +1 @@
+../../../scripts/rnnlm/
\ No newline at end of file
diff --git a/egs/libri_css/s5_mono/run.sh b/egs/libri_css/s5_mono/run.sh
new file mode 100755
index 00000000000..7b5f4b8e350
--- /dev/null
+++ b/egs/libri_css/s5_mono/run.sh
@@ -0,0 +1,99 @@
+#!/usr/bin/env bash
+#
+# LibriCSS monaural baseline recipe.
+#
+# Copyright 2020 Johns Hopkins University (Author: Desh Raj)
+# Apache 2.0
+
+# Begin configuration section.
+nj=50
+decode_nj=20
+stage=0
+
+# Different stages
+asr_stage=1
+diarizer_stage=0
+rnnlm_stage=0
+decode_stage=0
+rnnlm_rescore=true
+
+data_affix=  # This can be used to distinguish between different data sources
+
+use_oracle_segments=false
+wpe=false
+
+# End configuration section
+. ./utils/parse_options.sh
+
+. ./cmd.sh
+. ./path.sh
+
+test_sets="dev${data_affix} eval${data_affix}"
+
+set -e # exit on error
+
+# please change the paths below according to your setup
+libricss_corpus=/export/fs01/LibriCSS
+librispeech_corpus=/export/corpora5/LibriSpeech/
+
+##########################################################################
+# We first prepare the LibriCSS data (monaural) in the Kaldi data
+# format. We use session 0 for dev and the others for eval.
+##########################################################################
+if [ $stage -le 0 ]; then
+  local/data_prep_mono.sh --data-affix "$data_affix" $libricss_corpus $librispeech_corpus
+fi
+
+#########################################################################
+# ASR MODEL TRAINING
+# In this stage, we prepare the Librispeech data and train our ASR model.
+# This part is taken from the librispeech recipe, with parts related to
+# decoding removed. We use the 100h clean subset to train most of the
+# GMM models, except the SAT model, which is trained on the 460h clean
+# subset. The nnet is trained on the full 960h (clean + other).
+# To avoid training the whole ASR from scratch, you can download the
+# chain model using:
+# wget http://kaldi-asr.org/models/13/0013_librispeech_s5.tar.gz
+# Once it is downloaded, extract using: tar -xvzf 0013_librispeech_s5.tar.gz
+# and copy the contents of the exp/ directory to your exp/.
+#########################################################################
+if [ $stage -le 1 ]; then
+  local/train_asr.sh --stage $asr_stage --nj $nj $librispeech_corpus
+fi
+
+##########################################################################
+# DIARIZATION MODEL TRAINING
+# You can also download a pretrained diarization model using:
+# wget http://kaldi-asr.org/models/12/0012_diarization_v1.tar.gz
+# Once it is downloaded, extract using: tar -xvzf 0012_diarization_v1.tar.gz
+# and copy the contents of the exp/ directory to your exp/
+##########################################################################
+if [ $stage -le 2 ]; then
+  local/train_diarizer.sh --stage $diarizer_stage \
+    --data-dir data/train_other_500 \
+    --model-dir exp/xvector_nnet_1a
+fi
+
+##########################################################################
+# RNNLM TRAINING
+# We train a TDNN-LSTM based LM that will be used for rescoring the
+# decoded lattices.
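+# (See local/rnnlm/tuning/run_tdnn_lstm_1a.sh for the network configuration,
+# the training data (librispeech-lm-norm), and the rescoring options.)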
+########################################################################## +if [ $stage -le 3 ]; then + local/rnnlm/train.sh --stage $rnnlm_stage +fi + +########################################################################## +# DECODING: We assume that we are just given the raw recordings (approx 10 +# mins each), without segments or speaker information, so we have to decode +# the whole pipeline, i.e., SAD -> Diarization -> ASR. This is done in the +# local/decode.sh script. +########################################################################## +if [ $stage -le 4 ]; then + local/decode.sh --stage $decode_stage \ + --test-sets "$test_sets" \ + --use-oracle-segments $use_oracle_segments \ + --rnnlm-rescore $rnnlm_rescore +fi + +exit 0; + diff --git a/egs/libri_css/s5_mono/sid b/egs/libri_css/s5_mono/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/libri_css/s5_mono/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/libri_css/s5_mono/steps b/egs/libri_css/s5_mono/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/libri_css/s5_mono/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/libri_css/s5_mono/utils b/egs/libri_css/s5_mono/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/libri_css/s5_mono/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file