From b4c7ab60e925372b9639d27b51e3cb84088b8588 Mon Sep 17 00:00:00 2001
From: yfliao
Date: Sat, 16 Mar 2019 23:25:45 +0800
Subject: [PATCH] [egs] Add "formosa_speech" recipe (Taiwanese Mandarin ASR) (#2474)
---
 egs/formosa/README.txt | 22 ++
 egs/formosa/s5/RESULTS | 43 ++++
 egs/formosa/s5/cmd.sh | 27 +++
 egs/formosa/s5/conf/decode.config | 5 +
 egs/formosa/s5/conf/mfcc.conf | 2 +
 egs/formosa/s5/conf/mfcc_hires.conf | 10 +
 egs/formosa/s5/conf/online_cmvn.conf | 1 +
 egs/formosa/s5/conf/pitch.conf | 1 +
 egs/formosa/s5/local/chain/run_tdnn.sh | 1 +
 .../s5/local/chain/tuning/run_tdnn_1a.sh | 181 +++++++++++++++
 .../s5/local/chain/tuning/run_tdnn_1b.sh | 188 +++++++++++++++
 .../s5/local/chain/tuning/run_tdnn_1c.sh | 191 +++++++++++++++
 .../s5/local/chain/tuning/run_tdnn_1d.sh | 190 +++++++++++++++
 .../s5/local/nnet3/run_ivector_common.sh | 145 ++++++++++++
 egs/formosa/s5/local/nnet3/run_tdnn.sh | 113 +++++++++
 egs/formosa/s5/local/prepare_data.sh | 60 +++++
 egs/formosa/s5/local/prepare_dict.sh | 55 +++++
 egs/formosa/s5/local/prepare_lm.sh | 42 ++++
 .../s5/local/run_cleanup_segmentation.sh | 66 ++++++
 egs/formosa/s5/local/score.sh | 8 +
 egs/formosa/s5/local/train_lms.sh | 63 +++++
 egs/formosa/s5/local/wer_hyp_filter | 19 ++
 egs/formosa/s5/local/wer_output_filter | 25 ++
 egs/formosa/s5/local/wer_ref_filter | 19 ++
 egs/formosa/s5/path.sh | 6 +
 egs/formosa/s5/run.sh | 217 ++++++++++++++++++
 egs/formosa/s5/steps | 1 +
 egs/formosa/s5/utils | 1 +
 28 files changed, 1702 insertions(+)
 create mode 100644 egs/formosa/README.txt
 create mode 100644 egs/formosa/s5/RESULTS
 create mode 100755 egs/formosa/s5/cmd.sh
 create mode 100644 egs/formosa/s5/conf/decode.config
 create mode 100644 egs/formosa/s5/conf/mfcc.conf
 create mode 100644 egs/formosa/s5/conf/mfcc_hires.conf
 create mode 100644 egs/formosa/s5/conf/online_cmvn.conf
 create mode 100644 egs/formosa/s5/conf/pitch.conf
 create mode 120000 egs/formosa/s5/local/chain/run_tdnn.sh
 create mode 100755 egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh
 create mode 100755 egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh
 create mode 100755 egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh
 create mode 100755 egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh
 create mode 100755 egs/formosa/s5/local/nnet3/run_ivector_common.sh
 create mode 100755 egs/formosa/s5/local/nnet3/run_tdnn.sh
 create mode 100755 egs/formosa/s5/local/prepare_data.sh
 create mode 100755 egs/formosa/s5/local/prepare_dict.sh
 create mode 100755 egs/formosa/s5/local/prepare_lm.sh
 create mode 100755 egs/formosa/s5/local/run_cleanup_segmentation.sh
 create mode 100755 egs/formosa/s5/local/score.sh
 create mode 100755 egs/formosa/s5/local/train_lms.sh
 create mode 100755 egs/formosa/s5/local/wer_hyp_filter
 create mode 100755 egs/formosa/s5/local/wer_output_filter
 create mode 100755 egs/formosa/s5/local/wer_ref_filter
 create mode 100755 egs/formosa/s5/path.sh
 create mode 100755 egs/formosa/s5/run.sh
 create mode 120000 egs/formosa/s5/steps
 create mode 120000 egs/formosa/s5/utils
diff --git a/egs/formosa/README.txt b/egs/formosa/README.txt
new file mode 100644
index 00000000000..3b9d78dad92
--- /dev/null
+++ b/egs/formosa/README.txt
@@ -0,0 +1,22 @@
+### Welcome to the demo recipe of the Formosa Speech in the Wild (FSW) Project ###
+
+The language habits of Taiwanese people differ from those of other Mandarin speakers, in both accent and culture [1]. In particular, Taiwan uses traditional Chinese characters (i.e., 繁體中文).
+To address this issue, a Taiwanese speech corpus collection project, "Formosa Speech in the Wild (FSW)", was initiated in 2017 to promote the development of Taiwanese-specific speech recognition techniques.
+
+The FSW corpus will be a large-scale database of real-life, multi-genre Taiwanese spontaneous speech, collected and transcribed from various sources (radio, TV, open courses, etc.). To demonstrate that this database is a reasonable data resource for Taiwanese spontaneous speech recognition research, a baseline recipe is provided here so that everybody, especially students, can develop their own systems easily and quickly.
+
+This recipe is based on the "NER-Trs-Vol1" corpus (about 150 hours of broadcast radio speech selected from FSW). For more details, please visit:
+* Formosa Speech in the Wild (FSW) project (https://sites.google.com/speech.ntut.edu.tw/fsw)
+
+If you want to apply for the NER-Trs-Vol1 corpus, please contact Yuan-Fu Liao (廖元甫) via "yfliao@mail.ntut.edu.tw". This corpus is only for non-commercial research/education use and will be distributed via our GitLab server at https://speech.nchc.org.tw.
+
+Any bug reports, errors, comments or suggestions are very welcome.
+
+Yuan-Fu Liao (廖元甫)
+Associate Professor
+Department of Electronic Engineering,
+National Taipei University of Technology
+http://www.ntut.edu.tw/~yfliao
+yfliao@mail.ntut.edu.tw
+
+............
+[1] The languages of Taiwan consist of several varieties belonging to the Austronesian and Sino-Tibetan language families. Taiwanese Mandarin, Hokkien, Hakka and the Formosan languages are used by 83.5%, 81.9%, 6.6% and 1.4% of the population respectively (2010). Given the prevalent use of Taiwanese Hokkien, the Mandarin spoken in Taiwan has been influenced by it to a great extent.
diff --git a/egs/formosa/s5/RESULTS b/egs/formosa/s5/RESULTS
new file mode 100644
index 00000000000..b047e5cefe4
--- /dev/null
+++ b/egs/formosa/s5/RESULTS
@@ -0,0 +1,43 @@
+#
+# Reference results
+#
+# Experimental settings:
+#
+# training set: shows CS, BG, DA, QG, SR, SY and WK, in total 18977 utt., 1,088,948 words
+# test set: shows JZ, GJ, KX and YX, in total 2112 utt., 135,972 words
+# eval set: shows JX, TD and WJ, in total 2222 utt., 104,648 words
+#
+# lexicon: 274,036 words
+# phones (IPA): 196 (tonal)
+#
+
+# WER: test
+
+%WER 61.32 [ 83373 / 135972, 5458 ins, 19156 del, 58759 sub ] exp/mono/decode_test/wer_11_0.0
+%WER 41.00 [ 55742 / 135972, 6725 ins, 12763 del, 36254 sub ] exp/tri1/decode_test/wer_15_0.0
+%WER 40.41 [ 54948 / 135972, 7366 ins, 11505 del, 36077 sub ] exp/tri2/decode_test/wer_14_0.0
+%WER 38.67 [ 52574 / 135972, 6855 ins, 11250 del, 34469 sub ] exp/tri3a/decode_test/wer_15_0.0
+%WER 35.70 [ 48546 / 135972, 7197 ins, 9717 del, 31632 sub ] exp/tri4a/decode_test/wer_17_0.0
+%WER 32.11 [ 43661 / 135972, 6112 ins, 10185 del, 27364 sub ] exp/tri5a/decode_test/wer_17_0.5
+%WER 31.36 [ 42639 / 135972, 6846 ins, 8860 del, 26933 sub ] exp/tri5a_cleaned/decode_test/wer_17_0.5
+%WER 24.43 [ 33218 / 135972, 5524 ins, 7583 del, 20111 sub ] exp/nnet3/tdnn_sp/decode_test/wer_12_0.0
+%WER 23.95 [ 32568 / 135972, 4457 ins, 10271 del, 17840 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.0
+%WER 23.54 [ 32006 / 135972, 4717 ins, 8644 del, 18645 sub ] exp/chain/tdnn_1b_sp/decode_test/wer_10_0.0
+%WER 20.64 [ 28067 / 135972, 4434 ins, 7946 del, 15687 sub ] exp/chain/tdnn_1c_sp/decode_test/wer_11_0.0
+%WER 20.98 [ 28527 / 135972, 4706 ins, 7816 del, 16005 sub ] exp/chain/tdnn_1d_sp/decode_test/wer_10_0.0
+
+# CER: test
+
+%WER 54.09 [ 116688 
/ 215718, 4747 ins, 24510 del, 87431 sub ] exp/mono/decode_test/cer_10_0.0 +%WER 32.61 [ 70336 / 215718, 5866 ins, 16282 del, 48188 sub ] exp/tri1/decode_test/cer_13_0.0 +%WER 32.10 [ 69238 / 215718, 6186 ins, 15772 del, 47280 sub ] exp/tri2/decode_test/cer_13_0.0 +%WER 30.40 [ 65583 / 215718, 6729 ins, 13115 del, 45739 sub ] exp/tri3a/decode_test/cer_12_0.0 +%WER 27.53 [ 59389 / 215718, 6311 ins, 13008 del, 40070 sub ] exp/tri4a/decode_test/cer_15_0.0 +%WER 24.21 [ 52232 / 215718, 6425 ins, 11543 del, 34264 sub ] exp/tri5a/decode_test/cer_15_0.0 +%WER 23.41 [ 50492 / 215718, 6645 ins, 10997 del, 32850 sub ] exp/tri5a_cleaned/decode_test/cer_17_0.0 +%WER 17.07 [ 36829 / 215718, 4734 ins, 9938 del, 22157 sub ] exp/nnet3/tdnn_sp/decode_test/cer_12_0.0 +%WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0 +%WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0 +%WER 13.72 [ 29605 / 215718, 4678 ins, 8066 del, 16861 sub ] exp/chain/tdnn_1c_sp/decode_test/cer_10_0.0 +%WER 14.08 [ 30364 / 215718, 5182 ins, 7588 del, 17594 sub ] exp/chain/tdnn_1d_sp/decode_test/cer_9_0.0 + diff --git a/egs/formosa/s5/cmd.sh b/egs/formosa/s5/cmd.sh new file mode 100755 index 00000000000..66ae9090820 --- /dev/null +++ b/egs/formosa/s5/cmd.sh @@ -0,0 +1,27 @@ +# "queue.pl" uses qsub. The options to it are +# options to qsub. If you have GridEngine installed, +# change this to a queue you have access to. +# Otherwise, use "run.pl", which will run jobs locally +# (make sure your --num-jobs options are no more than +# the number of cpus on your machine. + +# Run locally: +#export train_cmd=run.pl +#export decode_cmd=run.pl + +# JHU cluster (or most clusters using GridEngine, with a suitable +# conf/queue.conf). +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" + +host=$(hostname -f) +if [ ${host#*.} == "fit.vutbr.cz" ]; then + queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, + export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" + export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" + export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G" +elif [ ${host#*.} == "cm.cluster" ]; then + # MARCC bluecrab cluster: + export train_cmd="slurm.pl --time 4:00:00 " + export decode_cmd="slurm.pl --mem 4G --time 4:00:00 " +fi diff --git a/egs/formosa/s5/conf/decode.config b/egs/formosa/s5/conf/decode.config new file mode 100644 index 00000000000..d91f86183af --- /dev/null +++ b/egs/formosa/s5/conf/decode.config @@ -0,0 +1,5 @@ +beam=11.0 # beam for decoding. Was 13.0 in the scripts. +first_beam=8.0 # beam for 1st-pass decoding in SAT. + + + diff --git a/egs/formosa/s5/conf/mfcc.conf b/egs/formosa/s5/conf/mfcc.conf new file mode 100644 index 00000000000..a1aa3d6c158 --- /dev/null +++ b/egs/formosa/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +--sample-frequency=16000 diff --git a/egs/formosa/s5/conf/mfcc_hires.conf b/egs/formosa/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..ca067e77b37 --- /dev/null +++ b/egs/formosa/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. 
+--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 8000 (=3800) diff --git a/egs/formosa/s5/conf/online_cmvn.conf b/egs/formosa/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..591367e7ae9 --- /dev/null +++ b/egs/formosa/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster. diff --git a/egs/formosa/s5/conf/pitch.conf b/egs/formosa/s5/conf/pitch.conf new file mode 100644 index 00000000000..e959a19d5b8 --- /dev/null +++ b/egs/formosa/s5/conf/pitch.conf @@ -0,0 +1 @@ +--sample-frequency=16000 diff --git a/egs/formosa/s5/local/chain/run_tdnn.sh b/egs/formosa/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..e1adaa9346d --- /dev/null +++ b/egs/formosa/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1d.sh \ No newline at end of file diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..d52644a66d1 --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +# This script is based on run_tdnn_7h.sh in swbd chain recipe. + +set -e + +# configs for 'chain' +affix=1a +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
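+  # The tree is built with the new 'chain' topology in $lang and the
+  # alignments in $ali_dir, using 5000 leaves; the number of leaves of this
+  # tree (not the GMM tree) determines the network's output dimension below.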
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..0134e63bce2 --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,188 @@ +#!/bin/bash + +# This script shows improvement arising from data cleaning. + +# CER: +# %WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0 +# %WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1b_sp +# exp/chain/tdnn_1b_sp: num-iters=133 nj=2..12 num-params=12.5M dim=43+100->4528 combine=-0.073->-0.073 (over 2) xent:train/valid[87,132,final]=(-1.05,-0.964,-0.963/-1.10,-1.06,-1.05) logprob:train/valid[87,132,final]=(-0.079,-0.065,-0.065/-0.094,-0.092,-0.092) + +set -e + +# configs for 'chain' +affix=1b +nnet3_affix=_1b +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. 
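+# A partially completed run can be resumed by passing the stage variables
+# defined above on the command line, e.g. (with hypothetical stage numbers):
+#   local/chain/tuning/run_tdnn_1b.sh --stage 11 --train-stage 60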
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi +exit 0; diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh new file mode 100755 index 00000000000..36ea128fdde --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh @@ -0,0 +1,191 @@ +#!/bin/bash + +# CER: +# %WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0 +# %WER 13.72 [ 29605 / 215718, 4678 ins, 8066 del, 16861 sub ] exp/chain/tdnn_1c_sp/decode_test/cer_10_0.0 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1c_sp +# exp/chain/tdnn_1c_sp: num-iters=147 nj=3..16 num-params=17.9M dim=43+100->4528 combine=-0.041->-0.041 (over 2) xent:train/valid[97,146,final]=(-0.845,-0.625,-0.618/-0.901,-0.710,-0.703) logprob:train/valid[97,146,final]=(-0.064,-0.040,-0.039/-0.072,-0.058,-0.057) + +set -e + +# configs for 'chain' +affix=1c +nnet3_affix=_1b +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=6 +initial_effective_lrate=0.00025 +final_effective_lrate=0.000025 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# End configuration section. 
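+# The dropout schedule above keeps dropout at 0 until 20% of training, ramps
+# it up to 0.5 at the halfway point, and brings it back to 0 by the end
+# (values in between are interpolated piecewise-linearly).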
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3$nnet3_affix/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + 
--chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix:+_$nnet3_affix}/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh new file mode 100755 index 00000000000..be21f2402a9 --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +# CER: +# 1a: %WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0 +# 1d: %WER 14.08 [ 30364 / 215718, 5182 ins, 7588 del, 17594 sub ] exp/chain/tdnn_1d_sp/decode_test/cer_9_0.0 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1d_sp +# exp/chain/tdnn_1d_sp: num-iters=157 nj=3..16 num-params=18.6M dim=43+100->5792 combine=-0.050->-0.050 (over 1) xent:train/valid[103,156,final]=(-0.977,-0.735,-0.725/-0.953,-0.772,-0.768) logprob:train/valid[103,156,final]=(-0.077,-0.052,-0.052/-0.079,-0.065,-0.066) + +set -e + +# configs for 'chain' +affix=1d +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=6 +initial_effective_lrate=0.00025 +final_effective_lrate=0.000025 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
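+  # Note: 1d uses a larger tree (7000 leaves) than the 5000 used in 1a-1c.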
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3$nnet3_affix/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + 
--trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix:+_$nnet3_affix}/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/nnet3/run_ivector_common.sh b/egs/formosa/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..723589ddd2e --- /dev/null +++ b/egs/formosa/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +set -euo pipefail + +# This script is modified based on mini_librispeech/s5/local/nnet3/run_ivector_common.sh + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train +test_sets="test eval" +gmm=tri5a + +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_sp_ali + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 70 data/${train_set}_sp \ + exp/make_mfcc/${train_set}_sp mfcc_perturbed || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp \ + exp/make_mfcc/${train_set}_sp mfcc_perturbed || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). 
+ # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=mfcc_perturbed_hires + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc_pitch.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + # create MFCC data dir without pitch to extract iVector + utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires data/${datadir}_hires_nopitch || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu wait \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
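+  # nj is set to the number of distinct speakers in utt2spk, since the
+  # decoding script cannot use more jobs than there are speakers.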
+ + for decode_set in test eval; do + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $decode_dir || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/prepare_data.sh b/egs/formosa/s5/local/prepare_data.sh new file mode 100755 index 00000000000..68f342e1549 --- /dev/null +++ b/egs/formosa/s5/local/prepare_data.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2018 Yuan-Fu Liao, National Taipei University of Technology +# AsusTek Computer Inc. (Author: Alex Hung) + +# Apache 2.0 + +set -e -o pipefail + +train_dir=NER-Trs-Vol1/Train +eval_dir=NER-Trs-Vol1-Eval +eval_key_dir=NER-Trs-Vol1-Eval-Key + +. ./path.sh +. parse_options.sh + +for x in $train_dir $eval_dir; do + if [ ! -d "$x" ] ; then + echo >&2 "The directory $x does not exist" + fi +done + +if [ -z "$(command -v dos2unix 2>/dev/null)" ]; then + echo "dos2unix not found on PATH. Please install it manually." + exit 1; +fi + +# have to remvoe previous files to avoid filtering speakers according to cmvn.scp and feats.scp +rm -rf data/all data/train data/test data/eval data/local/train +mkdir -p data/all data/train data/test data/eval data/local/train + + +# make utt2spk, wav.scp and text +find $train_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $y' \; | dos2unix > data/all/utt2spk +find $train_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $x' \; | dos2unix > data/all/wav.scp +find $train_dir -name *.txt -exec sh -c 'x={}; y=$(basename -s .txt $x); printf "%s " $y; cat $x' \; | dos2unix > data/all/text + +# fix_data_dir.sh fixes common mistakes (unsorted entries in wav.scp, +# duplicate entries and so on). Also, it regenerates the spk2utt from +# utt2spk +utils/fix_data_dir.sh data/all + +echo "Preparing train and test data" +# test set: JZ, GJ, KX, YX +grep -E "(JZ|GJ|KX|YX)_" data/all/utt2spk | awk '{print $1}' > data/all/cv.spk +utils/subset_data_dir_tr_cv.sh --cv-spk-list data/all/cv.spk data/all data/train data/test + +# for LM training +echo "cp data/train/text data/local/train/text for language model training" +cat data/train/text | awk '{$1=""}1;' | awk '{$1=$1}1;' > data/local/train/text + +# preparing EVAL set. +find $eval_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $y' \; | dos2unix > data/eval/utt2spk +find $eval_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $x' \; | dos2unix > data/eval/wav.scp +find $eval_key_dir -name *.txt -exec sh -c 'x={}; y=$(basename -s .txt $x); printf "%s " $y; cat $x' \; | dos2unix > data/eval/text +utils/fix_data_dir.sh data/eval + +echo "Data preparation completed." 
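+# At this point data/train, data/test and data/eval have been created, along
+# with data/local/train/text for language-model training.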
+exit 0; diff --git a/egs/formosa/s5/local/prepare_dict.sh b/egs/formosa/s5/local/prepare_dict.sh new file mode 100755 index 00000000000..4e580f5f6e8 --- /dev/null +++ b/egs/formosa/s5/local/prepare_dict.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2018 Yuan-Fu Liao, National Taipei University of Technology +# Apache 2.0 + +source_dir=NER-Trs-Vol1/Language +dict_dir=data/local/dict +rm -rf $dict_dir +mkdir -p $dict_dir + +# +# +# +rm -f $dict_dir/lexicon.txt +touch $dict_dir/lexicon.txt +cat $source_dir/lexicon.txt > $dict_dir/lexicon.txt +echo " SIL" >> $dict_dir/lexicon.txt + +# +# define silence phone +# +rm -f $dict_dir/silence_phones.txt +touch $dict_dir/silence_phones.txt + +echo "SIL" > $dict_dir/silence_phones.txt + +# +# find nonsilence phones +# +rm -f $dict_dir/nonsilence_phones.txt +touch $dict_dir/nonsilence_phones.txt + +cat $source_dir/lexicon.txt | grep -v -F -f $dict_dir/silence_phones.txt | \ + perl -ane 'print join("\n", @F[1..$#F]) . "\n"; ' | \ + sort -u > $dict_dir/nonsilence_phones.txt + +# +# add optional silence phones +# + +rm -f $dict_dir/optional_silence.txt +touch $dict_dir/optional_silence.txt +echo "SIL" > $dict_dir/optional_silence.txt + +# +# extra questions +# +rm -f $dict_dir/extra_questions.txt +touch $dict_dir/extra_questions.txt +cat $dict_dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1; +cat $dict_dir/nonsilence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' >> $dict_dir/extra_questions.txt || exit 1; + +echo "Dictionary preparation succeeded" +exit 0; diff --git a/egs/formosa/s5/local/prepare_lm.sh b/egs/formosa/s5/local/prepare_lm.sh new file mode 100755 index 00000000000..59fe1529658 --- /dev/null +++ b/egs/formosa/s5/local/prepare_lm.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 + +set -e -o pipefail + +# To create G.fst from ARPA language model +. 
./path.sh || die "path.sh expected"; + +local/train_lms_srilm.sh --train-text data/train/text data/ data/srilm + +#nl -nrz -w10 corpus/LM/iban-bp-2012.txt | utils/shuffle_list.pl > data/local/external_text +local/train_lms_srilm.sh --train-text data/local/external_text data/ data/srilm_external + +# let's do ngram interpolation of the previous two LMs +# the lm.gz is always symlink to the model with the best perplexity, so we use that + +mkdir -p data/srilm_interp +for w in 0.9 0.8 0.7 0.6 0.5; do + ngram -lm data/srilm/lm.gz -mix-lm data/srilm_external/lm.gz \ + -lambda $w -write-lm data/srilm_interp/lm.${w}.gz + echo -n "data/srilm_interp/lm.${w}.gz " + ngram -lm data/srilm_interp/lm.${w}.gz -ppl data/srilm/dev.txt | paste -s - +done | sort -k15,15g > data/srilm_interp/perplexities.txt + +# for basic decoding, let's use only a trigram LM +[ -d data/lang_test/ ] && rm -rf data/lang_test +cp -R data/lang data/lang_test +lm=$(cat data/srilm/perplexities.txt | grep 3gram | head -n1 | awk '{print $1}') +local/arpa2G.sh $lm data/lang_test data/lang_test + +# for decoding using bigger LM let's find which interpolated gave the most improvement +[ -d data/lang_big ] && rm -rf data/lang_big +cp -R data/lang data/lang_big +lm=$(cat data/srilm_interp/perplexities.txt | head -n1 | awk '{print $1}') +local/arpa2G.sh $lm data/lang_big data/lang_big + +# for really big lm, we should only decode using small LM +# and resocre using the big lm +utils/build_const_arpa_lm.sh $lm data/lang_big data/lang_big +exit 0; diff --git a/egs/formosa/s5/local/run_cleanup_segmentation.sh b/egs/formosa/s5/local/run_cleanup_segmentation.sh new file mode 100755 index 00000000000..b72cd89b4d1 --- /dev/null +++ b/egs/formosa/s5/local/run_cleanup_segmentation.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Nagendra Kumar Goel +# 2019 AsusTek Computer Inc. (author: Alex Hung) +# Apache 2.0 + +# This script demonstrates how to re-segment training data selecting only the +# "good" audio that matches the transcripts. +# The basic idea is to decode with an existing in-domain acoustic model, and a +# biased language model built from the reference, and then work out the +# segmentation from a ctm like file. + +# For nnet3 and chain results after cleanup, see the scripts in +# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh + +# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets +# [will add these later]. + +set -e +set -o pipefail +set -u + +stage=0 +cleanup_stage=0 +data=data/train +cleanup_affix=cleaned +srcdir=exp/tri5a +langdir=data/lang_test +nj=20 +decode_nj=20 +decode_num_threads=1 + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. utils/parse_options.sh + +cleaned_data=${data}_${cleanup_affix} + +dir=${srcdir}_${cleanup_affix}_work +cleaned_dir=${srcdir}_${cleanup_affix} + +if [ $stage -le 1 ]; then + # This does the actual data cleanup. 
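+  # It decodes the training data against a biased LM built from the
+  # transcripts, writes the re-segmented, filtered copy to ${cleaned_data},
+  # and keeps the intermediate lattices/ctm files in $dir.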
+ steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage \ + --nj $nj --cmd "$train_cmd" \ + $data $langdir $srcdir $dir $cleaned_data +fi + +if [ $stage -le 2 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $cleaned_data $langdir $srcdir ${srcdir}_ali_${cleanup_affix} +fi + +if [ $stage -le 3 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 3500 100000 $cleaned_data $langdir ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} +fi + +utils/data/get_utt2dur.sh data/train_cleaned +ori_avg_dur=$(awk 'BEGIN{total=0}{total += $2}END{printf("%.2f", total/NR)}' ${data}/utt2dur) +new_avg_dur=$(awk 'BEGIN{total=0}{total += $2}END{printf("%.2f", total/NR)}' ${cleaned_data}/utt2dur) +echo "average duration was reduced from ${ori_avg_dur}s to ${new_avg_dur}s." +# average duration was reduced from 21.68s to 10.97s. +exit 0; diff --git a/egs/formosa/s5/local/score.sh b/egs/formosa/s5/local/score.sh new file mode 100755 index 00000000000..a9786169973 --- /dev/null +++ b/egs/formosa/s5/local/score.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e -o pipefail +set -x +steps/score_kaldi.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" + +echo "$0: Done" diff --git a/egs/formosa/s5/local/train_lms.sh b/egs/formosa/s5/local/train_lms.sh new file mode 100755 index 00000000000..efc5b92c573 --- /dev/null +++ b/egs/formosa/s5/local/train_lms.sh @@ -0,0 +1,63 @@ +#!/bin/bash + + +# To be run from one directory above this script. +. ./path.sh + +text=data/local/train/text +lexicon=data/local/dict/lexicon.txt + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +# This script takes no arguments. It assumes you have already run +# aishell_data_prep.sh. +# It takes as input the files +# data/local/train/text +# data/local/dict/lexicon.txt +dir=data/local/lm +mkdir -p $dir + +kaldi_lm=`which train_lm.sh` +if [ -z $kaldi_lm ]; then + echo "$0: train_lm.sh is not found. That might mean it's not installed" + echo "$0: or it is not added to PATH" + echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it" + exit 1 +fi + +cleantext=$dir/text.no_oov + +cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + > $cleantext || exit 1; + +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ + sort -nr > $dir/word.counts || exit 1; + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; + +# note: we probably won't really make use of as there aren't any OOVs +cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ + || exit 1; + +# note: ignore 1st field of train.txt, it's the utterance-id. +cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} + { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ + || exit 1; + +train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; + +# LM is small enough that we don't need to prune it (only about 0.7M N-grams). 
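+# (If pruning were ever needed, kaldi_lm also provides prune_lm.sh, e.g.
+# "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/"; it is not run here.)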
+# Perplexity over 128254.000000 words is 90.446690 + +# note: output is +# data/local/lm/3gram-mincount/lm_unpruned.gz + +exit 0; diff --git a/egs/formosa/s5/local/wer_hyp_filter b/egs/formosa/s5/local/wer_hyp_filter new file mode 100755 index 00000000000..519d92ee80d --- /dev/null +++ b/egs/formosa/s5/local/wer_hyp_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=(''); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/formosa/s5/local/wer_output_filter b/egs/formosa/s5/local/wer_output_filter new file mode 100755 index 00000000000..06a99a43e34 --- /dev/null +++ b/egs/formosa/s5/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +while (<>) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "")) { + print ""; + } else { + print "$s" + } + print " "; + } + print "\n"; +} + + diff --git a/egs/formosa/s5/local/wer_ref_filter b/egs/formosa/s5/local/wer_ref_filter new file mode 100755 index 00000000000..519d92ee80d --- /dev/null +++ b/egs/formosa/s5/local/wer_ref_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=(''); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/formosa/s5/path.sh b/egs/formosa/s5/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/formosa/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/formosa/s5/run.sh b/egs/formosa/s5/run.sh new file mode 100755 index 00000000000..a4d0f2dcd1d --- /dev/null +++ b/egs/formosa/s5/run.sh @@ -0,0 +1,217 @@ +#!/bin/bash +# +# Copyright 2018, Yuan-Fu Liao, National Taipei University of Technology, yfliao@mail.ntut.edu.tw +# +# Before you run this recipe, please apply, download and put or make a link of the corpus under this folder (folder name: "NER-Trs-Vol1"). +# For more detail, please check: +# 1. Formosa Speech in the Wild (FSW) project (https://sites.google.com/speech.ntut.edu.tw/fsw/home/corpus) +# 2. Formosa Speech Recognition Challenge (FSW) 2018 (https://sites.google.com/speech.ntut.edu.tw/fsw/home/challenge) +stage=-2 +num_jobs=20 + +train_dir=NER-Trs-Vol1/Train +eval_dir=NER-Trs-Vol1-Eval +eval_key_dir=NER-Trs-Vol1-Eval-Key + +# shell options +set -eo pipefail + +. ./cmd.sh +. 
./utils/parse_options.sh + +# configure number of jobs running in parallel, you should adjust these numbers according to your machines +# data preparation +if [ $stage -le -2 ]; then + # Lexicon Preparation, + echo "$0: Lexicon Preparation" + local/prepare_dict.sh || exit 1; + + # Data Preparation + echo "$0: Data Preparation" + local/prepare_data.sh --train-dir $train_dir --eval-dir $eval_dir --eval-key-dir $eval_key_dir || exit 1; + + # Phone Sets, questions, L compilation + echo "$0: Phone Sets, questions, L compilation Preparation" + rm -rf data/lang + utils/prepare_lang.sh --position-dependent-phones false data/local/dict \ + "" data/local/lang data/lang || exit 1; + + # LM training + echo "$0: LM training" + rm -rf data/local/lm/3gram-mincount + local/train_lms.sh || exit 1; + + # G compilation, check LG composition + echo "$0: G compilation, check LG composition" + utils/format_lm.sh data/lang data/local/lm/3gram-mincount/lm_unpruned.gz \ + data/local/dict/lexicon.txt data/lang_test || exit 1; + +fi + +# Now make MFCC plus pitch features. +# mfccdir should be some place with a largish disk where you +# want to store MFCC features. +mfccdir=mfcc + +# mfcc +if [ $stage -le -1 ]; then + echo "$0: making mfccs" + for x in train test eval; do + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $num_jobs data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + utils/fix_data_dir.sh data/$x || exit 1; + done +fi + +# mono +if [ $stage -le 0 ]; then + echo "$0: train mono model" + # Make some small data subsets for early system-build stages. + echo "$0: make training subsets" + utils/subset_data_dir.sh --shortest data/train 3000 data/train_mono + + # train mono + steps/train_mono.sh --boost-silence 1.25 --cmd "$train_cmd" --nj $num_jobs \ + data/train_mono data/lang exp/mono || exit 1; + + # Get alignments from monophone system. 
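+  # (these monophone alignments seed the first triphone pass, tri1, below)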
+ steps/align_si.sh --boost-silence 1.25 --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/mono exp/mono_ali || exit 1; + + # Monophone decoding + ( + utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \ + exp/mono/graph data/test exp/mono/decode_test + )& +fi + +# tri1 +if [ $stage -le 1 ]; then + echo "$0: train tri1 model" + # train tri1 [first triphone pass] + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; + + # align tri1 + steps/align_si.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + + # decode tri1 + ( + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \ + exp/tri1/graph data/test exp/tri1/decode_test + )& +fi + +# tri2 +if [ $stage -le 2 ]; then + echo "$0: train tri2 model" + # train tri2 [delta+delta-deltas] + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1; + + # align tri2b + steps/align_si.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri2 exp/tri2_ali || exit 1; + + # decode tri2 + ( + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \ + exp/tri2/graph data/test exp/tri2/decode_test + )& +fi + +# tri3a +if [ $stage -le 3 ]; then + echo "$-: train tri3 model" + # Train tri3a, which is LDA+MLLT, + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1; + + # decode tri3a + ( + utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \ + exp/tri3a/graph data/test exp/tri3a/decode_test + )& +fi + +# tri4 +if [ $stage -le 4 ]; then + echo "$0: train tri4 model" + # From now, we start building a more serious system (with SAT), and we'll + # do the alignment with fMLLR. + steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; + + # align tri4a + steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri4a exp/tri4a_ali + + # decode tri4a + ( + utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph + steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \ + exp/tri4a/graph data/test exp/tri4a/decode_test + )& +fi + +# tri5 +if [ $stage -le 5 ]; then + echo "$0: train tri5 model" + # Building a larger SAT system. 
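+  # (3500 leaves / 100000 Gaussians, up from the 2500 / 20000 used for tri4a)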
+ steps/train_sat.sh --cmd "$train_cmd" \ + 3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + + # align tri5a + steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri5a exp/tri5a_ali || exit 1; + + # decode tri5 + ( + utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1; + steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test || exit 1; + )& +fi + +# nnet3 tdnn models +# commented out by default, since the chain model is usually faster and better +#if [ $stage -le 6 ]; then + # echo "$0: train nnet3 model" + # local/nnet3/run_tdnn.sh +#fi + +# chain model +if [ $stage -le 7 ]; then + # The iVector-extraction and feature-dumping parts coulb be skipped by setting "--train_stage 7" + echo "$0: train chain model" + local/chain/run_tdnn.sh +fi + +# getting results (see RESULTS file) +if [ $stage -le 8 ]; then + echo "$0: extract the results" + for test_set in test eval; do + echo "WER: $test_set" + for x in exp/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null + for x in exp/*/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null + echo + + echo "CER: $test_set" + for x in exp/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null + for x in exp/*/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null + echo + done +fi + +# finish +echo "$0: all done" + +exit 0; diff --git a/egs/formosa/s5/steps b/egs/formosa/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/formosa/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/formosa/s5/utils b/egs/formosa/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/formosa/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file