Commit
Some new functionality in nnet3; including drafts of example scripts
danpovey committed Jul 29, 2015
1 parent e150228 commit 53baf84

Showing 10 changed files with 435 additions and 30 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -6,6 +6,7 @@

# emacs saves
[#]*[#]
.[#]*
*~
# .depend.mk files
.depend.mk
@@ -59,6 +60,7 @@
/tools/CLAPACK_include
/tools/kaldi_lm
/tools/env.sh
/tools/rnnlm-hs-0.1b/rnnlm

# /src/
/src/kaldi.mk
84 changes: 84 additions & 0 deletions egs/wsj/s5/local/nnet3/run_ivector_common.sh
@@ -0,0 +1,84 @@
#!/bin/bash

# this script is called from scripts like run_ms.sh; it does the common stages
# of the build, such as feature extraction.
# This is actually the same as local/online/run_nnet2_common.sh, except
# for the directory names.
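# Example invocation (illustrative), run from egs/wsj/s5:
#   local/nnet3/run_ivector_common.sh --stage 5   # e.g. restart from iVector-extractor training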

. cmd.sh
mfccdir=mfcc

stage=1

. cmd.sh
. ./path.sh
. ./utils/parse_options.sh


if [ $stage -le 1 ]; then
for datadir in train_si284 test_eval93 test_dev93 test_eval92; do
utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
steps/make_mfcc.sh --nj 40 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
done
utils/subset_data_dir.sh --first data/train_si284_hires 7138 data/train_si84_hires || exit 1
fi

if [ $stage -le 2 ]; then
# We need to build a small system just because we need the LDA+MLLT transform
# to train the diag-UBM on top of. We align the si84 data for this purpose.

steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri4b exp/nnet3/tri4b_ali_si84
fi

if [ $stage -le 3 ]; then
# Train a small system just for its LDA+MLLT transform. We use --num-iters 13
# because after we get the transform (12th iter is the last), any further
# training is pointless.
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
--realign-iters "" \
--splice-opts "--left-context=3 --right-context=3" \
5000 10000 data/train_si84_hires data/lang \
exp/nnet3/tri4b_ali_si84 exp/nnet3/tri5b
fi

if [ $stage -le 4 ]; then
mkdir -p exp/nnet3

steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
--num-frames 400000 data/train_si84_hires 256 exp/nnet3/tri5b exp/nnet3/diag_ubm
fi

if [ $stage -le 5 ]; then
# even though $nj is just 10, each job uses multiple processes and threads.
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/train_si284_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1;
fi

if [ $stage -le 6 ]; then
# We extract iVectors on all the train_si284 data, which will be what we
# train the system on.

# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_si284_hires \
data/train_si284_hires_max2
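# (Roughly speaking, --utts-per-spk-max 2 splits each real speaker into
# pseudo-speakers with at most 2 utterances each, e.g. a 4-utterance speaker
# becomes two 2-utterance pseudo-speakers; this keeps the iVectors seen in
# training closer to what per-utterance decoding will see.)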

steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
data/train_si284_hires_max2 exp/nnet3/extractor exp/nnet3/ivectors_train_si284 || exit 1;
fi

if [ $stage -le 7 ]; then
rm exp/nnet3/.error 2>/dev/null
for data in test_eval92 test_dev93 test_eval93; do
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 8 \
data/${data}_hires exp/nnet3/extractor exp/nnet3/ivectors_${data} || touch exp/nnet3/.error &
done
wait
[ -f exp/nnet3/.error ] && echo "$0: error extracting iVectors." && exit 1;
fi

exit 0;
68 changes: 68 additions & 0 deletions egs/wsj/s5/local/nnet3/run_tdnn.sh
@@ -0,0 +1,68 @@
#!/bin/bash

# this is the standard "tdnn" system, built in nnet3; it's what we used to
# call multi-splice.

. cmd.sh


# At this script level we don't support running without a GPU, as it would be painfully slow.
# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false,
# --num-threads 16 and --minibatch-size 128.
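# For reference, a CPU-only run would look roughly like the sketch below
# (untested; all other options as in the GPU invocation further down):
#   steps/nnet3/train_tdnn.sh --gpu false --num-threads 16 --minibatch-size 128 \
#     ... data/train_si284_hires data/lang exp/tri4b_ali_si284 $dir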

stage=0
train_stage=-10
dir=exp/nnet3/nnet_ms_a
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh


if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi

local/nnet3/run_ivector_common.sh --stage $stage || exit 1;

if [ $stage -le 8 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
fi


steps/nnet3/train_tdnn.sh --stage $train_stage \
--num-epochs 8 --num-jobs-initial 2 --num-jobs-final 14 \
--num-hidden-layers 4 \
--splice-indexes "-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" \
--feat-type raw \
--online-ivector-dir exp/nnet3/ivectors_train_si284 \
--cmvn-opts "--norm-means=false --norm-vars=false" \
--io-opts "-tc 12" \
--initial-effective-lrate 0.005 --final-effective-lrate 0.0005 \
--cmd "$decode_cmd" \
--pnorm-input-dim 2000 \
--pnorm-output-dim 250 \
--mix-up 12000 \
data/train_si284_hires data/lang exp/tri4b_ali_si284 $dir || exit 1;
fi


if [ $stage -le 9 ]; then
# this does offline decoding that should give the same results as the real
# online decoding.
for lm_suffix in tgpr bd_tgpr; do
graph_dir=exp/tri4b/graph_${lm_suffix}
# use already-built graphs.
for year in eval92 dev93; do
steps/nnet3/decode.sh --nj 8 --cmd "$decode_cmd" \
--online-ivector-dir exp/nnet3/ivectors_test_$year \
$graph_dir data/test_${year}_hires $dir/decode_${lm_suffix}_${year} || exit 1;
done
done
fi

1 change: 0 additions & 1 deletion egs/wsj/s5/run.sh
@@ -108,7 +108,6 @@ for x in test_eval92 test_eval93 test_dev93 train_si284; do
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
done


utils/subset_data_dir.sh --first data/train_si284 7138 data/train_si84 || exit 1

# Now make subset with the shortest 2k utterances from si-84.
43 changes: 22 additions & 21 deletions egs/wsj/s5/steps/nnet3/train_tdnn.sh
@@ -51,7 +51,7 @@ stage=-6
exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage

# count space-separated fields in splice_indexes to get num-hidden-layers.
-splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0"
+splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0"
# Format : layer<hidden_layer>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
# note: hidden layers are composed of one or more components,
# so hidden-layer indexing is different from the component count
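# Example reading of the default above (an illustrative sketch): there are 6
# space-separated groups, so by the rule above there are 6 hidden layers;
# "-4,-3,-2,-1,0,1,2,3,4" means the first layer splices the input at time
# offsets -4..+4, "-2,2" means that layer splices the previous layer's output
# at offsets -2 and +2, and a bare "0" means only the current frame is used.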
@@ -62,7 +62,7 @@ randprune=4.0 # speeds up LDA.
affine_opts=

gpu=true # if true, we run on GPU.
-cpu_num_threads=16 # if using CPU, the number of threads we use.
+num_threads=16 # if using CPU, the number of threads we use.
combine_num_threads=8 # number of threads for the "combine" operation
cleanup=true
egs_dir=
@@ -257,7 +257,7 @@ if [ $stage -le -3 ]; then

# Write stats with the same format as stats for LDA.
$cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \
-nnet3-get-lda-stats --rand-prune=$rand_prune \
+nnet3-acc-lda-stats --rand-prune=$rand_prune \
$dir/init.raw $egs_dir/egs.JOB.ark $dir/JOB.lda_stats || exit 1;

all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done)
@@ -327,25 +327,25 @@ finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

if [ $num_threads -eq 1 ]; then
if $gpu; then
parallel_suffix="-simple" # this enables us to use GPU code if
# we have just one thread.
train_queue_opt="--gpu 1"
parallel_train_opts=
if !$gpu; then
train_gpu_opt="--gpu 1"
if ! cuda-compiled; then
echo "$0: WARNING: you are running with one thread but you have not compiled"
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
exit 1
fi
else
echo "$0: WARNING: running with 1 thread and no GPU: this will be slow."
if ! cuda-compiled; then
echo "$0: WARNING: you are running with one thread but you have not compiled"
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
exit 1
fi
else
$gpu && echo "$0: you must use --gpu false if you supply num-threads > 1" && exit 1;
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
if [ $num_threads -gt 1 ]; then
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
train_queue_opt="--num-threads $num_threads"
else
parallel_suffix="-simple"
fi
fi


@@ -367,7 +367,6 @@ first_model_combine=$[$num_iters-$num_iters_combine+1]

x=0


for realign_time in $realign_times; do
# Work out the iterations on which we will re-align, if the --realign-times
# option was used. This is slightly approximate.
@@ -407,7 +406,8 @@ while [ $x -lt $num_iters ]; do
$cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
nnet3-copy-egs --srand=JOB --frame=random $context_opts ark:$prev_egs_dir/egs.1.ark ark:- \| \
nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
-nnet3-compute-from-egs "nnet3-to-raw $dir/$x.mdl -|" ark:- ark:- \| \
+nnet3-merge-egs ark:- ark:- \| \
+nnet3-compute-from-egs --apply-exp "nnet3-to-raw $dir/$x.mdl -|" ark:- ark:- \| \
matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear.
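# (Roughly what the pipeline above does: it takes a random subset of the
# examples, merges them into minibatches, computes the exponentiated network
# outputs (posteriors) with nnet3-compute-from-egs, sums the posteriors over
# frames with matrix-sum-rows and over the subset with vector-sum, giving
# per-job count vectors post.$x.JOB.vec that are later combined to estimate
# per-pdf priors.)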
@@ -499,7 +499,7 @@ while [ $x -lt $num_iters ]; do
# same archive with different frame indexes will give similar gradients,
# so we want to separate them in time.
-$cmd $train_gpu_opt $dir/log/train.$x.$n.log \
+$cmd $train_queue_opt $dir/log/train.$x.$n.log \
nnet3-train$parallel_suffix $parallel_train_opts --minibatch-size=$this_minibatch_size --srand=$x "$raw" \
"ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size ark:- ark:- |" \
$dir/$[$x+1].$n.raw || touch $dir/.error &
@@ -600,7 +600,8 @@ if [ $stage -le $[$num_iters+1] ]; then
$cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \
nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
-nnet3-compute-from-egs "nnet3-am-copy --raw=true $dir/final.mdl -|" ark:- ark:- \| \
+nnet3-merge-egs ark:- ark:- \| \
+nnet3-compute-from-egs --apply-exp=true "nnet3-am-copy --raw=true $dir/final.mdl -|" ark:- ark:- \| \
matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;
sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear.
3 changes: 2 additions & 1 deletion src/nnet3bin/Makefile
@@ -7,7 +7,8 @@ LDFLAGS += $(CUDA_LDFLAGS)
LDLIBS += $(CUDA_LDLIBS)

BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \
-nnet3-shuffle-egs nnet3-get-lda-stats
+nnet3-shuffle-egs nnet3-acc-lda-stats nnet3-merge-egs \
+nnet3-compute-from-egs

OBJFILES =

src/nnet3bin/{nnet3-get-lda-stats.cc → nnet3-acc-lda-stats.cc}
@@ -1,4 +1,4 @@
-// nnet3bin/nnet3-get-lda-stats.cc
+// nnet3bin/nnet3-acc-lda-stats.cc

// Copyright 2015 Johns Hopkins University (author: Daniel Povey)

@@ -125,16 +125,14 @@ int main(int argc, char *argv[]) {
"training examples is used for the class labels. Used in obtaining\n"
"feature transforms that help nnet training work better.\n"
"\n"
-"Usage: nnet3-get-lda-stats [options] <raw-nnet-in> <training-examples-in> <lda-stats-out>\n"
+"Usage: nnet3-acc-lda-stats [options] <raw-nnet-in> <training-examples-in> <lda-stats-out>\n"
"e.g.:\n"
-"nnet3-get-lda-stats 0.raw ark:1.egs 1.acc\n"
+"nnet3-acc-lda-stats 0.raw ark:1.egs 1.acc\n"
"See also: nnet-get-feature-transform\n";

bool binary_write = true;
BaseFloat rand_prune = 0.0;

LdaEstimate lda;

ParseOptions po(usage);
po.Register("binary", &binary_write, "Write output in binary mode");
po.Register("rand-prune", &rand_prune,