diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4q.sh b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh new file mode 100755 index 00000000000..7d91e5e66f5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +# this is based on Dan's tdnn_2o script +# it has a different splicing configuration +# it uses the PerDimensionWeightedAverage pooling in place of the Jesus layer + +# it relies on new steps/nnet3/chain/train_tdnn_b.sh script which accepts more +# parameters are calls new config generator steps/nnet3/tdnn/make_configs.py +# which is more in line with other config generators. + +set -e + +#%WER 11.1 | 1831 21395 | 90.2 6.3 3.5 1.3 11.1 46.6 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 16.6 | 4459 42989 | 85.2 9.5 5.3 1.8 16.6 53.4 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 15.59 [ 7671 / 49204, 883 ins, 2234 del, 4554 sub ] exp/chain/tdnn_v1_trial6_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 + + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4q # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window=7 +pool_type='per-dim-weighted-average' +pool_lpfilter_width= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
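As a quick sanity check on the splice configuration above: the total model context is just the sum, over layers, of each layer's most negative and most positive offsets (this mirrors the parsing added in steps/nnet3/tdnn/make_configs.py later in this patch; the helper below is purely illustrative and ignores any extra context added by the pooling window).

def total_context(splice_indexes):
    # each space-separated group is one layer; its extreme offsets add to
    # the overall left/right context of the network
    left = right = 0
    for layer in splice_indexes.split():
        offsets = sorted(int(x) for x in layer.split(","))
        left += max(0, -offsets[0])
        right += max(0, offsets[-1])
    return left, right

print(total_context("-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"))  # (18, 13)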
+ + steps/nnet3/chain/train_tdnn_b.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --pool-type "$pool_type" \ + --pool-window "$pool_window" \ + --pool-lpfilter-width "$pool_lpfilter_width" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim $relu_dim \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + --egs-dir "$common_egs_dir" \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index 46ba6d13925..ea3898b83da 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -41,6 +41,7 @@ frames_per_iter=800000 # each iteration of training, see this many [input] # frames per job. This option is passed to get_egs.sh. # Aim for about a minute of training time right_tolerance=10 +left_tolerance=5 denominator_scale=1.0 # relates to tombsone stuff. num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training @@ -73,6 +74,10 @@ exit_stage=-100 # you can set this to terminate the training early. Exits befor # count space-separated fields in splice_indexes to get num-hidden-layers. 
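To make the comment above concrete: the number of hidden layers is simply the number of space-separated groups in the splice string. A one-line illustration (not part of the script), using the default shown just below:

splice_indexes = "-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0"
num_hidden_layers = len(splice_indexes.split())
print(num_hidden_layers)  # 6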
splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" +pool_type='none' +pool_window= +pool_lpfilter_width= + # Format : layer/....layer/ " # note: hidden layers which are composed of one or more components, # so hidden layer indexing is different from component count @@ -219,8 +224,14 @@ if [ $stage -le -5 ]; then else dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" fi + # create the config files for nnet initialization - python steps/nnet3/make_tdnn_configs.py \ + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + + python steps/nnet3/tdnn/make_configs.py $pool_opts\ --include-log-softmax=false \ --final-layer-normalize-target $final_layer_normalize_target \ --splice-indexes "$splice_indexes" \ @@ -231,6 +242,7 @@ if [ $stage -le -5 ]; then --use-presoftmax-prior-scale false \ $dir/configs || exit 1; fi + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; # we do this as it's a convenient way to get the stats for the 'lda-like' @@ -264,6 +276,8 @@ if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then steps/nnet3/chain/get_egs.sh $egs_opts "${extra_opts[@]}" \ --frames-per-iter $frames_per_iter --stage $get_egs_stage \ --cmd "$cmd" \ + --right-tolerance "$right_tolerance" \ + --left-tolerance "$left_tolerance" \ --frames-per-eg $frames_per_eg \ --frame-subsampling-factor $frame_subsampling_factor \ $data $dir $latdir $dir/egs || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 87323a1c3e1..0383a9ff6f3 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -6,6 +6,45 @@ import sys import warnings import copy +from operator import itemgetter +import numpy as np +try: + import scipy.signal as signal + has_scipy_signal = True +except ImportError: + has_scipy_signal = False + +def WriteKaldiMatrix(matrix, matrix_file_name): + assert(len(matrix.shape) == 2) + # matrix is a numpy array + matrix_file = open(matrix_file_name, "w") + [rows, cols ] = matrix.shape + matrix_file.write('[\n') + for row in range(rows): + matrix_file.write(' '.join( map(lambda x: '{0:f}'.format(x), matrix[row, : ]))) + if row == rows - 1: + matrix_file.write("]") + else: + matrix_file.write('\n') + matrix_file.close() +def GetSumDescriptor(inputs): + sum_descriptors = inputs + while len(sum_descriptors) != 1: + cur_sum_descriptors = [] + pair = [] + while len(sum_descriptors) > 0: + value = sum_descriptors.pop() + if value.strip() != '': + pair.append(value) + if len(pair) == 2: + cur_sum_descriptors.append("Sum({0}, {1})".format(pair[0], pair[1])) + pair = [] + if pair: + cur_sum_descriptors.append(pair[0]) + sum_descriptors = cur_sum_descriptors + return sum_descriptors + + # adds the input nodes and returns the descriptor def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): @@ -19,11 +58,26 @@ def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): components.append('input-node name=ivector dim=' + str(ivector_dim)) list.append('ReplaceIndex(ivector, t, 0)') output_dim += ivector_dim - splice_descriptor = "Append({0})".format(", ".join(list)) + if len(list) > 1: + splice_descriptor = "Append({0})".format(", ".join(list)) + else: + splice_descriptor = list[0] 
print(splice_descriptor) return {'descriptor': splice_descriptor, 'dimension': output_dim} +def AddNoOpLayer(config_lines, name, input): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append('component name={0}_noop type=NoOpComponent dim={1}'.format(name, input['dimension'])) + component_nodes.append('component-node name={0}_noop component={0}_noop input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_noop'.format(name), + 'dimension': input['dimension']} + + + def AddLdaLayer(config_lines, name, input, lda_file): components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -34,6 +88,30 @@ def AddLdaLayer(config_lines, name, input, lda_file): return {'descriptor': '{0}_lda'.format(name), 'dimension': input['dimension']} +def AddBlockAffineLayer(config_lines, name, input, output_dim, num_blocks): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + assert((input['dimension'] % num_blocks == 0) and + (output_dim % num_blocks == 0)) + components.append('component name={0}_block_affine type=BlockAffineComponent input-dim={1} output-dim={2} num-blocks={3}'.format(name, input['dimension'], output_dim, num_blocks)) + component_nodes.append('component-node name={0}_block_affine component={0}_block_affine input={1}'.format(name, input['descriptor'])) + + return {'descriptor' : '{0}_block_affine'.format(name), + 'dimension' : output_dim} + + +def AddPermuteLayer(config_lines, name, input, column_map): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + permute_indexes = ",".join(map(lambda x: str(x), column_map)) + components.append('component name={0}_permute type=PermuteComponent column-map={1}'.format(name, permute_indexes)) + component_nodes.append('component-node name={0}_permute component={0}_permute input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_permute'.format(name), + 'dimension': input['dimension']} + + + def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = ""): components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -60,6 +138,36 @@ def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options 'dimension': output_dim} +def AddConvolutionLayer(config_lines, name, input, + input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + num_filters, input_vectorization, + param_stddev = None, bias_stddev = None, + filter_bias_file = None, + is_updatable = True): + assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim) + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + conv_init_string = "component name={0}_conv type=ConvolutionComponent input-x-dim={1} input-y-dim={2} input-z-dim={3} filt-x-dim={4} filt-y-dim={5} filt-x-step={6} filt-y-step={7} input-vectorization-order={8}".format(name, input_x_dim, input_y_dim, input_z_dim, filt_x_dim, filt_y_dim, filt_x_step, filt_y_step, input_vectorization) + if filter_bias_file is not None: + conv_init_string += " matrix={0}".format(filter_bias_file) + if is_updatable: + conv_init_string += " is-updatable=true" + else: + conv_init_string += " is-updatable=false" + + components.append(conv_init_string) + component_nodes.append("component-node name={0}_conv_t component={0}_conv input={1}".format(name, input['descriptor'])) + + num_x_steps = (1 + 
(input_x_dim - filt_x_dim) / filt_x_step) + num_y_steps = (1 + (input_y_dim - filt_y_dim) / filt_y_step) + output_dim = num_x_steps * num_y_steps * num_filters; + return {'descriptor': '{0}_conv_t'.format(name), + 'dimension': output_dim} + + def AddSoftmaxLayer(config_lines, name, input): components = config_lines['components'] @@ -72,7 +180,7 @@ def AddSoftmaxLayer(config_lines, name, input): 'dimension': input['dimension']} -def AddOutputNode(config_lines, input, label_delay=None): +def AddOutputLayer(config_lines, input, label_delay=None): components = config_lines['components'] component_nodes = config_lines['component-nodes'] if label_delay is None: @@ -80,12 +188,18 @@ def AddOutputNode(config_lines, input, label_delay=None): else: component_nodes.append('output-node name=output input=Offset({0},{1})'.format(input['descriptor'], label_delay)) -def AddFinalLayer(config_lines, input, output_dim, ng_affine_options = "", label_delay=None, include_softmax = "true"): +def AddFinalLayer(config_lines, input, output_dim, ng_affine_options = " param-stddev=0 bias-stddev=0 ", label_delay=None, use_presoftmax_prior_scale = False, prior_scale_file = None, include_log_softmax = True): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + prev_layer_output = AddAffineLayer(config_lines, "Final", input, output_dim, ng_affine_options) - if include_softmax == "true": - prev_layer_output = AddSoftmaxLayer(config_lines, "Final", prev_layer_output) - AddOutputNode(config_lines, prev_layer_output, label_delay) - + if include_log_softmax: + if use_presoftmax_prior_scale : + components.append('component name=Final-fixed-scale type=FixedScaleComponent scales={0}'.format(prior_scale_file)) + component_nodes.append('component-node name=Final-fixed-scale component=Final-fixed-scale input={0}'.format(prev_layer_output['descriptor'])) + prev_layer_output['descriptor'] = "Final-fixed-scale" + prev_layer_output = AddSoftmaxLayer(config_lines, "Final", prev_layer_output) + AddOutputLayer(config_lines, prev_layer_output, label_delay) def AddLstmLayer(config_lines, name, input, cell_dim, diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py new file mode 100755 index 00000000000..e100ac0f2af --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import os +import argparse +import sys +import warnings +import copy +import imp +import ast +import scipy.signal as signal +import numpy as np + +nodes = imp.load_source('', 'steps/nnet3/nodes.py') + + +def AddPerDimAffineLayer(config_lines, name, input, input_window): + filter_context = int((input_window - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + + # add permute component to shuffle the feature columns of the Append + # descriptor output so that columns corresponding to the same feature index + # are contiguous add a block-affine component to collapse all the feature + # indexes across time steps into a single value + num_feats = 
input['dimension'] + num_times = len(filter_input_splice_indexes) + column_map = [] + for i in range(num_feats): + for j in range(num_times): + column_map.append(j * num_feats + i) + permuted_output_descriptor = nodes.AddPermuteLayer(config_lines, + name, filter_input_descriptor, column_map) + + # add a block-affine component + output_descriptor = nodes.AddBlockAffineComponent(config_lines, name, + permuted_output_descriptor, + num_feats, num_feats) + + return [output_descriptor, filter_context, filter_context] + + +def AddLpFilter(config_lines, name, input, rate, num_lpfilter_taps, lpfilt_filename, is_updatable = False): + # low-pass smoothing of input was specified. so we will add a low-pass filtering layer + lp_filter = signal.firwin(num_lpfilter_taps, rate, width=None, window='hamming', pass_zero=True, scale=True, nyq=1.0) + lp_filter = np.append(lp_filter, 0) + nodes.WriteKaldiMatrix(np.array([lp_filter]), lpfilt_filename) + filter_context = int((num_lpfilter_taps - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + input_x_dim = len(filter_input_splice_indexes) + input_y_dim = input['dimension'] + input_z_dim = 1 + filt_x_dim = len(filter_input_splice_indexes) + filt_y_dim = 1 + filt_x_step = 1 + filt_y_step = 1 + input_vectorization = 'zyx' + + tdnn_input_descriptor = nodes.AddConvolutionLayer(config_lines, name, + filter_input_descriptor, + input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + 1, input_vectorization, + filter_bias_file = lpfilt_filename, + is_updatable = is_updatable) + + + return [tdnn_input_descriptor, filter_context, filter_context] + + + +def PrintConfig(file_name, config_lines): + f = open(file_name, 'w') + f.write("\n".join(config_lines['components'])+"\n") + f.write("\n#Component nodes\n") + f.write("\n".join(config_lines['component-nodes'])) + f.close() + +def ParseSpliceString(splice_indexes, label_delay=None): + ## Work out splice_array e.g. splice_array = [ [ -3,-2,...3 ], [0], [-2,2], .. [ -8,8 ] ] + splice_array = [] + left_context = 0 + right_context = 0 + split1 = args.splice_indexes.split(" "); # we already checked the string is nonempty. 
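A standalone check of what AddLpFilter above builds: the smoothing taps come directly from scipy.signal.firwin, and a window of N taps adds (N-1)/2 frames of context on each side. The cutoff value here is a hypothetical stand-in for --pool-lpfilter-width; the patch additionally appends a zero bias tap before writing the Kaldi matrix.

import numpy as np
import scipy.signal as signal

num_lpfilter_taps = 7    # e.g. pool_window=7, as in the 4q recipe
cutoff = 0.25            # hypothetical normalized cutoff (pool_lpfilter_width)
lp_filter = signal.firwin(num_lpfilter_taps, cutoff, window='hamming')
filter_context = (num_lpfilter_taps - 1) // 2
print(len(lp_filter), filter_context)           # 7 3
assert np.allclose(lp_filter, lp_filter[::-1])  # linear-phase filter: symmetric taps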
+ if len(split1) < 1: + sys.exit("invalid --splice-indexes argument, too short: " + + args.splice_indexes) + try: + for string in split1: + split2 = string.split(",") + if len(split2) < 1: + sys.exit("invalid --splice-indexes argument, too-short element: " + + args.splice_indexes) + int_list = [] + for int_str in split2: + int_list.append(int(int_str)) + if not int_list == sorted(int_list): + sys.exit("elements of --splice-indexes must be sorted: " + + args.splice_indexes) + left_context += -int_list[0] + right_context += int_list[-1] + splice_array.append(int_list) + except ValueError as e: + sys.exit("invalid --splice-indexes argument " + args.splice_indexes + e) + left_context = max(0, left_context) + right_context = max(0, right_context) + num_hidden_layers = len(splice_array) + input_dim = len(splice_array[0]) * args.feat_dim + args.ivector_dim + + return {'left_context':left_context, + 'right_context':right_context, + 'splice_indexes':splice_array, + 'num_hidden_layers':len(splice_array) + } + +if __name__ == "__main__": + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description="Writes config files and variables " + "for TDNNs creation and training", + epilog="See steps/nnet3/tdnn/train.sh for example.") + # General neural network options + parser.add_argument("--splice-indexes", type=str, + help="Splice indexes at input layer, e.g. '-3,-2,-1,0,1,2,3' [compulsary argument]", default="0") + parser.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") + parser.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) + parser.add_argument("--include-log-softmax", type=str, + help="add the final softmax layer ", default="true", choices = ["false", "true"]) + parser.add_argument("--final-layer-normalize-target", type=float, + help="RMS target for final layer (set to <1 if final layer learns too fast", + default=1.0) + parser.add_argument("--subset-dim", type=int, default=0, + help="dimension of the subset of units to be sent to the central frame") + parser.add_argument("--pnorm-input-dim", type=int, + help="input dimension to p-norm nonlinearities") + parser.add_argument("--pnorm-output-dim", type=int, + help="output dimension of p-norm nonlinearities") + parser.add_argument("--relu-dim", type=int, + help="dimension of ReLU nonlinearities") + parser.add_argument("--pool-type", type=str, default = 'none', + help="Type of pooling to be used.", choices = ['low-pass', 'sum', 'max', 'weighted-average', 'per-dim-weighted-average', 'none']) + parser.add_argument("--pool-window", type=int, default = None, + help="Width of the pooling window") + parser.add_argument("--pool-lpfilter-width", type=float, + default = None, help="Nyquist frequency of the lpfilter to be used for pooling") + parser.add_argument("--use-presoftmax-prior-scale", type=str, + help="if true, a presoftmax-prior-scale is added", + choices=['true', 'false'], default = "true") + parser.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") + parser.add_argument("config_dir", + help="Directory to write config files and variables") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
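For the 'per-dim-weighted-average' pool type offered above, AddPerDimAffineLayer earlier in this file permutes the Append() output so that all time-shifted copies of each feature dimension become contiguous before the per-dim BlockAffineComponent collapses them. A small sketch of that column map, with toy sizes chosen only for illustration:

num_feats, num_times = 4, 3   # hypothetical: 4-dim features, pooling window of 3
# Append() lays columns out time-major: [t-1 dims | t dims | t+1 dims]
column_map = [j * num_feats + i for i in range(num_feats) for j in range(num_times)]
print(column_map)   # [0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11]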
+ if args.splice_indexes is None: + sys.exit("--splice-indexes argument is required") + if args.feat_dim is None or not (args.feat_dim > 0): + sys.exit("--feat-dim argument is required") + if args.num_targets is None or not (args.num_targets > 0): + sys.exit("--num-targets argument is required") + if (args.subset_dim < 0): + sys.exit("--subset-dim has to be non-negative") + if (args.pool_window is not None) and (args.pool_window <= 0): + sys.exit("--pool-window has to be positive") + + if not args.relu_dim is None: + if not args.pnorm_input_dim is None or not args.pnorm_output_dim is None: + sys.exit("--relu-dim argument not compatible with " + "--pnorm-input-dim or --pnorm-output-dim options"); + nonlin_input_dim = args.relu_dim + nonlin_output_dim = args.relu_dim + else: + if not args.pnorm_input_dim > 0 or not args.pnorm_output_dim > 0: + sys.exit("--relu-dim not set, so expected --pnorm-input-dim and " + "--pnorm-output-dim to be provided."); + nonlin_input_dim = args.pnorm_input_dim + nonlin_output_dim = args.pnorm_output_dim + + prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(args.config_dir) + if args.use_presoftmax_prior_scale == "true": + use_presoftmax_prior_scale = True + else: + use_presoftmax_prior_scale = False + + parsed_splice_output = ParseSpliceString(args.splice_indexes.strip()) + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + + config_lines = {'components':[], 'component-nodes':[]} + + config_files={} + prev_layer_output = nodes.AddInputLayer(config_lines, args.feat_dim, splice_indexes[0], args.ivector_dim) + + # Add the init config lines for estimating the preconditioning matrices + init_config_lines = copy.deepcopy(config_lines) + init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') + init_config_lines['components'].insert(0, '# preconditioning matrix computation') + nodes.AddOutputLayer(init_config_lines, prev_layer_output) + config_files[args.config_dir + '/init.config'] = init_config_lines + + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat') + + left_context = 0 + right_context = 0 + # we moved the first splice layer to before the LDA.. 
+ # so the input to the first affine layer is going to [0] index + splice_indexes[0] = [0] + for i in range(0, num_hidden_layers): + # make the intermediate config file for layerwise discriminative training + # if specified, pool the input from the previous layer + + # prepare the spliced input + if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): + if args.pool_type != "none" and args.pool_window is None: + raise Exception("Pooling type was specified as {0}, this requires specification of the pool-window".format(args.pool_type)) + if args.pool_type in set(["low-pass", "weighted-average"]): + if args.pool_type == "weighted-average": + lpfilter_is_updatable = True + else: + lpfilter_is_updatable = False + # low-pass filter the input to smooth it before the sub-sampling + [prev_layer_output, cur_left_context, cur_right_context] = AddLpFilter(config_lines, + 'Tdnn_input_smoother_{0}'.format(i), + prev_layer_output, + args.pool_lpfilter_width, + args.pool_window, + args.config_dir + '/Tdnn_input_smoother_{0}.txt'.format(i), + is_updatable = lpfilter_is_updatable) + left_context += cur_left_context + right_context += cur_right_context + + if args.pool_type == "per-dim-weighted-average": + # add permute component to shuffle the feature columns of the Append descriptor output so + # that columns corresponding to the same feature index are contiguous + # add a block-affine component to collapse all the feature indexes across time steps into a single value + [prev_layer_output, cur_left_context, cur_right_context] = AddPerDimAffineLayer(config_lines, + 'Tdnn_input_PDA_{0}'.format(i), + prev_layer_output, + args.pool_window) + + left_context += cur_left_context + right_context += cur_right_context + + if args.pool_type == "sum": + raise NotImplementedError("Sum-pooling has not been tested yet.") + + if args.pool_type == "max" : + raise NotImplementedError("Max-pooling component needs to be reimplemented for this.") + + try: + zero_index = splice_indexes[i].index(0) + except ValueError: + zero_index = None + # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor + prev_layer_output_descriptor = prev_layer_output['descriptor'] + subset_output = prev_layer_output + if args.subset_dim > 0: + # if subset_dim is specified the script expects a zero in the splice indexes + assert(zero_index is not None) + subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, args.subset_dim) + subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i), + 'dimension' : args.subset_dim} + config_lines['component-nodes'].append(subset_node_config) + appended_descriptors = [] + appended_dimension = 0 + for j in range(len(splice_indexes[i])): + if j == zero_index: + appended_descriptors.append(prev_layer_output['descriptor']) + appended_dimension += prev_layer_output['dimension'] + continue + appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[i][j])) + appended_dimension += subset_output['dimension'] + prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), + 'dimension' : appended_dimension} + else: + # this is a normal affine node + pass + prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i), + prev_layer_output, nonlin_output_dim, norm_target_rms = 1.0 if i < num_hidden_layers -1 else args.final_layer_normalize_target) + # a final layer is added after each new layer as we are generating 
configs for layer-wise discriminative training + nodes.AddFinalLayer(config_lines, prev_layer_output, args.num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True if args.include_log_softmax == "true" else False) + + config_files['{0}/layer{1}.config'.format(args.config_dir, i+1)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + + left_context += int(parsed_splice_output['left_context']) + right_context += int(parsed_splice_output['right_context']) + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(args.config_dir + "/vars", "w") + print('left_context=' + str(left_context), file=f) + print('right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + f.close() + + # printing out the configs + # init.config used to train lda-mllt train + for key in config_files.keys(): + PrintConfig(key, config_files[key]) diff --git a/egs/wsj/s5/steps/nnet3/tdnn/train.sh b/egs/wsj/s5/steps/nnet3/tdnn/train.sh new file mode 100755 index 00000000000..773e10ccab6 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/train.sh @@ -0,0 +1,660 @@ +#!/bin/bash + +# note, TDNN is the same as what we used to call multisplice. + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2013 Xiaohui Zhang +# 2013 Guoguo Chen +# 2014 Vimal Manohar +# 2014 Vijayaditya Peddinti +# Apache 2.0. + + +# Begin configuration section. +cmd=run.pl +num_epochs=15 # Number of epochs of training; + # the number of iterations is worked out from this. +initial_effective_lrate=0.01 +final_effective_lrate=0.001 +pnorm_input_dim=3000 +pnorm_output_dim=300 +relu_dim= # you can use this to make it use ReLU's instead of p-norms. +rand_prune=4.0 # Relates to a speedup we do for LDA. +minibatch_size=512 # This default is suitable for GPU-based training. + # Set it to 128 for multi-threaded CPU-based training. +max_param_change=2.0 # max param change per minibatch +samples_per_iter=400000 # each iteration of training, see this many samples + # per job. This option is passed to get_egs.sh +num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training +num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training +prior_subset_size=20000 # 20k samples per job, for computing priors. +num_jobs_compute_prior=10 # these are single-threaded, run on CPU. +get_egs_stage=0 # can be used for rerunning after partial +online_ivector_dir= +presoftmax_prior_scale_power=-0.25 +use_presoftmax_prior_scale=true +remove_egs=true # set to false to disable removing egs after training is done. + +max_models_combine=20 # The "max_models_combine" is the maximum number of models we give + # to the final 'combine' stage, but these models will themselves be averages of + # iteration-number ranges. + +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + # (the point of this is to get data in different minibatches on different iterations, + # since in the preconditioning method, 2 samples in the same minibatch can + # affect each others' gradients. 
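The two 'effective' learning-rate options above interact with the job count: the effective rate decays exponentially from the initial to the final value over training, and the rate actually passed to each parallel job is that effective rate multiplied by the current number of jobs (see the perl one-liner in the main training loop further down). A simplified sketch of the schedule, tracking the iteration fraction directly instead of the processed-archive count the script uses:

import math

def per_job_lrate(it, num_iters, initial_eff=0.01, final_eff=0.001,
                  num_jobs_initial=1, num_jobs_final=8):
    num_jobs = int(0.5 + num_jobs_initial +
                   (num_jobs_final - num_jobs_initial) * float(it) / num_iters)
    frac = float(it) / num_iters
    eff = initial_eff * math.exp(frac * math.log(final_eff / initial_eff))
    return num_jobs, num_jobs * eff

print(per_job_lrate(0, 250))    # (1, 0.01)
print(per_job_lrate(250, 250))  # (8, ~0.008)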
+ +add_layers_period=2 # by default, add new layers every 2 iterations. +stage=-6 +exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage + +# count space-separated fields in splice_indexes to get num-hidden-layers. +splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" +# Format : layer/....layer/ " +# note: hidden layers which are composed of one or more components, +# so hidden layer indexing is different from component count +chunk_training=false # if true training is done with chunk randomization, rather than frame randomization + +randprune=4.0 # speeds up LDA. +use_gpu=true # if true, we run on GPU. +cleanup=true +egs_dir= +max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. +lda_opts= +egs_opts= +transform_dir= # If supplied, this dir used instead of alidir to find transforms. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. + # only relevant for "raw" features, not lda. +feat_type=raw # or set to 'lda' to use LDA features. +align_cmd= # The cmd that is passed to steps/nnet2/align.sh +align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] +realign_times= # List of times on which we realign. Each time is + # floating point number strictly between 0 and 1, which + # will be multiplied by the num-iters to get an iteration + # number. +num_jobs_align=30 # Number of jobs for realignment +# End configuration section. +frames_per_eg=8 # to be passed on to get_egs.sh +subset_dim=0 + +trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|15> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." + echo " # data, 0.00025 for large data" + echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" + echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" + echo " --presoftmax-prior-scale-power # use the specified power value on the priors (inverse priors) to scale" + echo " # the pre-softmax outputs (set to 0.0 to disable the presoftmax element scale)" + echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." + echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" + echo " # results as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" + echo " # versus your defaults, because it gets multiplied by the -pe smp argument." + echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" + echo " # should not get too large, e.g. >2k)." 
+ echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --splice-indexes " + echo " # Frame indices used for each splice layer." + echo " # Format : layer/....layer/ " + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --lda-dim # Dimension to reduce spliced features to with LDA" + echo " --realign-times # A list of space-separated floating point numbers between 0.0 and" + echo " # 1.0 to specify how far through training realignment is to be done" + echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" + echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment" + echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 +dir=$4 + +if [ ! -z "$realign_times" ]; then + [ -z "$align_cmd" ] && echo "$0: realign_times specified but align_cmd not specified" && exit 1 + [ -z "$align_use_gpu" ] && echo "$0: realign_times specified but align_use_gpu not specified" && exit 1 +fi + +# Check some files. +for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +# Set some variables. +num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1 +[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1 +[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1 + +nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir... +# in this dir we'll have just one job. +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +cp $alidir/tree $dir + + +# First work out the feature and iVector dimension, needed for tdnn config creation. +case $feat_type in + raw) feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \ + { echo "$0: Error getting feature dim"; exit 1; } + ;; + lda) [ ! -f $alidir/final.mat ] && echo "$0: With --feat-type lda option, expect $alidir/final.mat to exist." + # get num-rows in lda matrix, which is the lda feature dim. + feat_dim=$(matrix-dim --print-args=false $alidir/final.mat | cut -f 1) + ;; + *) + echo "$0: Bad --feat-type '$feat_type';"; exit 1; +esac +if [ -z "$online_ivector_dir" ]; then + ivector_dim=0 +else + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; +fi + + +if [ $stage -le -5 ]; then + echo "$0: creating neural net configs"; + + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + python steps/nnet3/tdnn/make_configs.py \ + --splice-indexes "$splice_indexes" \ + --subset-dim "$subset_dim" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $dim_opts \ + --use-presoftmax-prior-scale $use_presoftmax_prior_scale \ + --num-targets $num_leaves \ + $dir/configs || exit 1; + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. 
+ $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; +fi + +# sourcing the "vars" below sets +# left_context=(something) +# right_context=(something) +# num_hidden_layers=(something) +. $dir/configs/vars || exit 1; + +context_opts="--left-context=$left_context --right-context=$right_context" + +! [ "$num_hidden_layers" -gt 0 ] && echo \ + "$0: Expected num_hidden_layers to be defined" && exit 1; + +[ -z "$transform_dir" ] && transform_dir=$alidir + + +if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then + extra_opts=() + [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") + [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) + [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) + extra_opts+=(--transform-dir $transform_dir) + extra_opts+=(--left-context $left_context) + extra_opts+=(--right-context $right_context) + echo "$0: calling get_egs.sh" + steps/nnet3/get_egs.sh $egs_opts "${extra_opts[@]}" \ + --samples-per-iter $samples_per_iter --stage $get_egs_stage \ + --cmd "$cmd" $egs_opts \ + --frames-per-eg $frames_per_eg \ + $data $alidir $dir/egs || exit 1; +fi + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; + exit 1; +fi +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; + exit 1; +fi + +# copy any of the following that exist, to $dir. +cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null + +# confirm that the egs_dir has the necessary context (especially important if +# the --egs-dir option was used on the command line). +egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 +egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ + echo "$0: egs in $egs_dir have too little context" && exit -1; + +frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } + +# num_archives_expanded considers each separate label-position from +# 0..frames_per_eg-1 to be a separate archive. +if [ "$chunk_training" == "true" ]; then + num_archives_expanded=$num_archives +else + num_archives_expanded=$[$num_archives*$frames_per_eg] +fi + +[ $num_jobs_initial -gt $num_jobs_final ] && \ + echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; + +[ $num_jobs_final -gt $num_archives_expanded ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1; + + +if [ $stage -le -3 ]; then + echo "$0: getting preconditioning matrix for input features." + num_lda_jobs=$num_archives + [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs + + # Write stats with the same format as stats for LDA. 
+ $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune=$rand_prune \ + $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1; + + all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done) + $cmd $dir/log/sum_transform_stats.log \ + sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1; + + rm $all_lda_accs || exit 1; + + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. + $cmd $dir/log/get_transform.log \ + nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1; + + ln -sf ../lda.mat $dir/configs/lda.mat +fi + + +if [ $stage -le -2 ]; then + echo "$0: preparing initial vector for FixedScaleComponent before softmax" + echo " ... using priors^$presoftmax_prior_scale_power and rescaling to average 1" + + # obtains raw pdf count + $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \ + ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ + post-to-tacc --per-pdf=true $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1; + $cmd $dir/log/sum_pdf_counts.log \ + vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1; + rm $dir/pdf_counts.* + + awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \ + '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i; total += $i; } + num_pdfs=NF-2; average_count = total/num_pdfs; + for (i=0; i $dir/presoftmax_prior_scale.vec + ln -sf ../presoftmax_prior_scale.vec $dir/configs/presoftmax_prior_scale.vec +fi + +if [ $stage -le -1 ]; then + # Add the first layer; this will add in the lda.mat and + # presoftmax_prior_scale.vec. + $cmd $dir/log/add_first_layer.log \ + nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1; + + # Convert to .mdl, train the transitions, set the priors. + $cmd $dir/log/init_mdl.log \ + nnet3-am-init $alidir/final.mdl $dir/0.raw - \| \ + nnet3-am-train-transitions - "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl || exit 1; +fi + + +# set num_iters so that as close as possible, we process the data $num_epochs +# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded, +# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. + +num_archives_to_process=$[$num_epochs*$num_archives_expanded] +num_archives_processed=0 +num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] + +! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \ + && echo "$0: Insufficient epochs" && exit 1 + +finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period] + +echo "$0: Will train for $num_epochs epochs = $num_iters iterations" + +if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + combine_queue_opt="--gpu 1" + prior_gpu_opt="--use-gpu=yes" + prior_queue_opt="--gpu 1" + parallel_train_opts= + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" + combine_queue_opt="" # the combine stage will be quite slow if not using + # GPU, as we didn't enable that program to use + # multiple threads. 
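Relating this back to the iteration count worked out just above the GPU setup: num_iters is chosen so that, with the job count ramping linearly from num_jobs_initial to num_jobs_final, roughly num_epochs passes are made over the expanded archives. A worked example with a hypothetical archive count:

num_epochs = 15                         # default above
num_jobs_initial, num_jobs_final = 1, 8
num_archives_expanded = 120             # hypothetical: num_archives * frames_per_eg

num_archives_to_process = num_epochs * num_archives_expanded
num_iters = (num_archives_to_process * 2) // (num_jobs_initial + num_jobs_final)
print(num_iters)  # 400 iterations at an average of 4.5 jobs each == 1800 archive passes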
+ prior_gpu_opt="--use-gpu=no" + prior_queue_opt="" +fi + + +approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final] +# First work out how many iterations we want to combine over in the final +# nnet3-combine-fast invocation. (We may end up subsampling from these if the +# number exceeds max_model_combine). The number we use is: +# min(max(max_models_combine, approx_iters_per_epoch_final), +# 1/2 * iters_after_last_layer_added) +num_iters_combine=$max_models_combine +if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then + num_iters_combine=$approx_iters_per_epoch_final +fi +half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2] +if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then + num_iters_combine=$half_iters_after_add_layers +fi +first_model_combine=$[$num_iters-$num_iters_combine+1] + +x=0 + +for realign_time in $realign_times; do + # Work out the iterations on which we will re-align, if the --realign-times + # option was used. This is slightly approximate. + ! perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);" && \ + echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1."; + # the next formula is based on the one for mix_up_iter above. + realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1; + realign_this_iter[$realign_iter]=$realign_time +done + +cur_egs_dir=$egs_dir + +while [ $x -lt $num_iters ]; do + [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; + + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") + + ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; + this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); + + echo "On iteration $x, learning rate is $this_learning_rate." + + if [ ! -z "${realign_this_iter[$x]}" ]; then + prev_egs_dir=$cur_egs_dir + cur_egs_dir=$dir/egs_${realign_this_iter[$x]} + fi + + if [ $x -ge 0 ] && [ $stage -le $x ]; then + if [ ! -z "${realign_this_iter[$x]}" ]; then + time=${realign_this_iter[$x]} + + echo "Getting average posterior for purposes of adjusting the priors." + # Note: this just uses CPUs, using a smallish subset of data. + # always use the first egs archive, which makes the script simpler; + # we're using different random subsets of it. + rm $dir/post.$x.*.vec 2>/dev/null + $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --srand=JOB --frame=random $context_opts ark:$prev_egs_dir/egs.1.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs --apply-exp=true "nnet3-am-copy --raw=true $dir/$x.mdl -|" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; + + sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. 
+ + $cmd $dir/log/vector_sum.$x.log \ + vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1; + rm $dir/post.$x.*.vec; + + echo "Re-adjusting priors based on computed posteriors" + $cmd $dir/log/adjust_priors.$x.log \ + nnet3-am-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1; + + sleep 2 + + steps/nnet3/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \ + --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \ + --iter $x $data $lang $dir $dir/ali_$time || exit 1 + + steps/nnet3/relabel_egs.sh --cmd "$cmd" --iter $x $dir/ali_$time \ + $prev_egs_dir $cur_egs_dir || exit 1 + + if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then + steps/nnet3/remove_egs.sh $prev_egs_dir + fi + fi + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + $cmd $dir/log/compute_prob_valid.$x.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.$x.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & + + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:-|" '&&' \ + nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" & + fi + + echo "Training neural net (pass $x)" + + if [ $x -gt 0 ] && \ + [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \ + [ $[$x%$add_layers_period] -eq 0 ]; then + do_average=false # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers=$[1+$x/$add_layers_period] + config=$dir/configs/layer$cur_num_hidden_layers.config + raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" + else + do_average=true + if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. + raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" + fi + if $do_average; then + this_minibatch_size=$minibatch_size + else + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + this_minibatch_size=$[$minibatch_size/2]; + fi + + rm $dir/.error 2>/dev/null + + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We can't easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + for n in $(seq $this_num_jobs); do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. 
+ frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame + # index; this increases more slowly than the archive index because the + # same archive with different frame indexes will give similar gradients, + # so we want to separate them in time. + + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-train $parallel_train_opts \ + --max-param-change=$max_param_change "$raw" \ + "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + ) + # the error message below is not that informative, but $cmd will + # have printed a more specific one. + [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1; + + nnets_list= + for n in `seq 1 $this_num_jobs`; do + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" + done + + if $do_average; then + # average the output of the different jobs. + $cmd $dir/log/average.$x.log \ + nnet3-average $nnets_list - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + else + # choose the best from the different jobs. + n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { + $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; + undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; + [ -z "$n" ] && echo "Error getting best model" && exit 1; + $cmd $dir/log/select.$x.log \ + nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + fi + + rm $nnets_list + [ ! -f $dir/$[$x+1].mdl ] && exit 1; + if [ -f $dir/$[$x-1].mdl ] && $cleanup && \ + [ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then + rm $dir/$[$x-1].mdl + fi + fi + x=$[$x+1] + num_archives_processed=$[$num_archives_processed+$this_num_jobs] +done + + +if [ $stage -le $num_iters ]; then + echo "Doing final combination to produce final.mdl" + + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + nnets_list=() + for n in $(seq 0 $[num_iters_combine-1]); do + iter=$[$first_model_combine+$n] + mdl=$dir/$iter.mdl + [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1; + nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|"; + done + + # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, + # as if there are many models it can give out-of-memory error; and we set + # num-threads to 8 to speed it up (this isn't ideal...) + + $cmd $combine_queue_opt $dir/log/combine.log \ + nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 "${nnets_list[@]}" "ark:nnet3-merge-egs --minibatch-size=1024 ark:$cur_egs_dir/combine.egs ark:-|" \ + "|nnet3-am-copy --set-raw-nnet=- $dir/$num_iters.mdl $dir/combined.mdl" || exit 1; + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. 
+ $cmd $dir/log/compute_prob_valid.final.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.final.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & +fi + +if [ $stage -le $[$num_iters+1] ]; then + echo "Getting average posterior for purposes of adjusting the priors." + # Note: this just uses CPUs, using a smallish subset of data. + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi + rm $dir/post.$x.*.vec 2>/dev/null + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ + "nnet3-am-copy --raw=true $dir/combined.mdl -|" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; + + sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. + + $cmd $dir/log/vector_sum.$x.log \ + vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1; + + rm $dir/post.$x.*.vec; + + echo "Re-adjusting priors based on computed posteriors" + $cmd $dir/log/adjust_priors.final.log \ + nnet3-am-adjust-priors $dir/combined.mdl $dir/post.$x.vec $dir/final.mdl || exit 1; +fi + + +if [ ! -f $dir/final.mdl ]; then + echo "$0: $dir/final.mdl does not exist." + # we don't want to clean up if the training didn't succeed. + exit 1; +fi + +sleep 2 + +echo Done + +if $cleanup; then + echo Cleaning up data + if $remove_egs && [[ $cur_egs_dir =~ $dir/egs* ]]; then + steps/nnet2/remove_egs.sh $cur_egs_dir + fi + + echo Removing most of the models + for x in `seq 0 $num_iters`; do + if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then + # delete all but every 100th model; don't delete the ones which combine to form the final model. + rm $dir/$x.mdl + fi + done +fi
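For readers who prefer not to decode the perl one-liner used earlier to pick the best of the parallel training jobs (on iterations where model averaging is skipped), here is an equivalent sketch in Python; the function name and return convention are mine, and the log-file pattern corresponds to the train.$iter.$n.log files written by the training loop above.

import re

def best_job(num_jobs, log_pattern):
    # pick the job whose log reports the highest (last-seen) log-prob-per-frame
    best_n, best_logprob = None, float("-inf")
    for n in range(1, num_jobs + 1):
        logprob = None
        with open(log_pattern % n) as f:
            for line in f:
                m = re.search(r"log-prob-per-frame=(\S+)", line)
                if m:
                    logprob = float(m.group(1))
        if logprob is not None and logprob > best_logprob:
            best_n, best_logprob = n, logprob
    return best_n

# e.g. best_job(this_num_jobs, "exp/nnet3/tdnn/log/train.42.%d.log")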