From 922f66f71c0da5c9143840ab6ced382061469b70 Mon Sep 17 00:00:00 2001
From: aarora8
Date: Mon, 20 Nov 2017 10:20:31 -0500
Subject: [PATCH] modify the cnn-tdnn architecture to improve WER

---
 egs/iam/s5/local/chain/run_cnn_1a.sh          |  16 +-
 egs/iam/s5/local/chain/run_cnn_chainali_1a.sh |  30 ++-
 egs/iam/s5/local/chain/run_cnn_chainali_1b.sh | 244 ++++++++++++++++++
 3 files changed, 271 insertions(+), 19 deletions(-)
 create mode 100755 egs/iam/s5/local/chain/run_cnn_chainali_1b.sh

diff --git a/egs/iam/s5/local/chain/run_cnn_1a.sh b/egs/iam/s5/local/chain/run_cnn_1a.sh
index 02df03d1cd6..4c312e49b98 100755
--- a/egs/iam/s5/local/chain/run_cnn_1a.sh
+++ b/egs/iam/s5/local/chain/run_cnn_1a.sh
@@ -1,10 +1,10 @@
 #!/bin/bash
 
 # steps/info/chain_dir_info.pl exp/chain/cnn_1a/
-# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->380 combine=-0.033->-0.025 xent:train/valid[13,20,final]=(-1.07,-1.31,-0.560/-1.30,-1.70,-0.978) logprob:train/valid[13,20,final]=(-0.064,-0.119,-0.011/-0.115,-0.208,-0.096)
+# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098)
 
 # head exp/chain/cnn_1a/decode_test/scoring_kaldi/best_wer
-#%WER 18.34 [ 3231 / 17616, 348 ins, 693 del, 2190 sub ] exp/chain/cnn_1a/decode_test/wer_8_1.0
+# %WER 19.10 [ 3365 / 17616, 225 ins, 891 del, 2249 sub ] exp/chain/cnn_1a/decode_test/wer_10_0.5
 
 set -e -o pipefail
 
@@ -29,8 +29,8 @@ alignment_subsampling_factor=1
 chunk_width=340,300,200,100
 num_leaves=500
 # we don't need extra left/right context for TDNN systems.
-chunk_left_context=32
-chunk_right_context=32
+chunk_left_context=0
+chunk_right_context=0
 tdnn_dim=450
 # training options
 srand=0
@@ -127,8 +127,8 @@ if [ $stage -le 4 ]; then
   num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
   learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
 
-  common1="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=36"
-  common2="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=70"
+  common1="height-offsets=-2,-1,0,1,2 num-filters-out=36"
+  common2="height-offsets=-2,-1,0,1,2 num-filters-out=70"
   mkdir -p $dir/configs
   cat <<EOF > $dir/configs/network.xconfig
   input dim=40 name=input
@@ -223,8 +223,8 @@ if [ $stage -le 7 ]; then
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
     --extra-left-context $chunk_left_context \
     --extra-right-context $chunk_right_context \
-    --extra-left-context-initial 32 \
-    --extra-right-context-final 32 \
+    --extra-left-context-initial 0 \
+    --extra-right-context-final 0 \
     --frames-per-chunk $frames_per_chunk \
     --nj $nj --cmd "$decode_cmd" \
     $dir/graph data/test $dir/decode_test || exit 1;
diff --git a/egs/iam/s5/local/chain/run_cnn_chainali_1a.sh b/egs/iam/s5/local/chain/run_cnn_chainali_1a.sh
index 0f0296d7af6..ff822cbb82c 100755
--- a/egs/iam/s5/local/chain/run_cnn_chainali_1a.sh
+++ b/egs/iam/s5/local/chain/run_cnn_chainali_1a.sh
@@ -1,13 +1,21 @@
 #!/bin/bash
 
-# chainali_1a uses chain model for lattice instead of gmm-hmm model. It has more cnn layers as compared to 1a
-# (18.34% -> 13.68%)
+# chainali_1a is the same as 1a, except it uses chain alignments (from the 1a system) instead of GMM alignments.
+
+# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_1a/
+# System                       cnn_chainali_1a   cnn_1a
+# WER                                    15.85    19.10
+# Final train prob                     -0.0128  -0.0297
+# Final valid prob                     -0.0447  -0.0975
+# Final train prob (xent)              -0.6448  -0.5915
+# Final valid prob (xent)              -0.9924  -1.0022
 
 # steps/info/chain_dir_info.pl exp/chain/cnn1a_chainali/
-# exp/chain/cnn_chainali_1a/: num-iters=21 nj=2..4 num-params=3.8M dim=40->380 combine=-0.009->-0.006 xent:train/valid[13,20,final]=(-0.870,-0.593,-0.568/-1.08,-0.889,-0.874) logprob:train/valid[13,20,final]=(-0.035,-0.003,-0.001/-0.077,-0.055,-0.054)
+# exp/chain/cnn_chainali_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.002->0.000 xent:train/valid[13,20,final]=(-0.929,-0.711,-0.645/-1.16,-1.04,-0.992) logprob:train/valid[13,20,final]=(-0.029,-0.016,-0.013/-0.051,-0.047,-0.045)
 
 # head exp/chain/cnn_chainali_1a/decode_test/scoring_kaldi/best_wer
-# %WER 13.68 [ 2410 / 17616, 243 ins, 633 del, 1534 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_8_1.0
+# %WER 15.85 [ 2793 / 17616, 235 ins, 557 del, 2001 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_9_0.0
+# %WER 7.76 [ 5114 / 65921, 834 ins, 1355 del, 2925 sub ] exp/chain/cnn_chainali_1a/decode_test/cer_9_0.5
 
 set -e -o pipefail
 
@@ -33,8 +41,8 @@ alignment_subsampling_factor=1
 chunk_width=340,300,200,100
 num_leaves=500
 # we don't need extra left/right context for TDNN systems.
-chunk_left_context=32
-chunk_right_context=32
+chunk_left_context=0
+chunk_right_context=0
 tdnn_dim=450
 # training options
 srand=0
@@ -131,9 +139,9 @@ if [ $stage -le 4 ]; then
   num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
   learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
 
-  common1="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=36"
-  common2="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=70"
-  common3="required-time-offsets=0 height-offsets=-1,0,1 num-filters-out=70"
+  common1="height-offsets=-2,-1,0,1,2 num-filters-out=36"
+  common2="height-offsets=-2,-1,0,1,2 num-filters-out=70"
+  common3="height-offsets=-1,0,1 num-filters-out=70"
   mkdir -p $dir/configs
   cat <<EOF > $dir/configs/network.xconfig
   input dim=40 name=input
@@ -228,8 +236,8 @@ if [ $stage -le 7 ]; then
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
     --extra-left-context $chunk_left_context \
     --extra-right-context $chunk_right_context \
-    --extra-left-context-initial 32 \
-    --extra-right-context-final 32 \
+    --extra-left-context-initial 0 \
+    --extra-right-context-final 0 \
     --frames-per-chunk $frames_per_chunk \
     --nj $nj --cmd "$decode_cmd" \
     $dir/graph data/test $dir/decode_test || exit 1;
diff --git a/egs/iam/s5/local/chain/run_cnn_chainali_1b.sh b/egs/iam/s5/local/chain/run_cnn_chainali_1b.sh
new file mode 100755
index 00000000000..ad144651783
--- /dev/null
+++ b/egs/iam/s5/local/chain/run_cnn_chainali_1b.sh
@@ -0,0 +1,244 @@
+#!/bin/bash
+
+# chainali_1b is the same as chainali_1a, except it has 3 more CNN layers and 1 fewer TDNN layer.
+
+# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_chainali_1b/
+# System                       cnn_chainali_1a  cnn_chainali_1b
+# WER                                    15.85            14.51
+# Final train prob                     -0.0128          -0.0112
+# Final valid prob                     -0.0447          -0.0375
+# Final train prob (xent)              -0.6448          -0.6230
+# Final valid prob (xent)              -0.9924          -0.9399
+
+# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1b/
+# exp/chain/cnn_chainali_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038)
+
+# %WER 14.51 [ 2556 / 17616, 210 ins, 573 del, 1773 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_10_0.0
+# %WER 7.02 [ 4629 / 65921, 742 ins, 1282 del, 2605 sub ] exp/chain/cnn_chainali_1b/decode_test/cer_9_0.0
+
+set -e -o pipefail
+
+stage=0
+
+nj=30
+train_set=train
+gmm=tri3  # this is the source gmm-dir that we'll use for alignments; it
+          # should have alignments for the specified training data.
+nnet3_affix=  # affix for exp dirs, e.g. it was _cleaned in tedlium.
+affix=_1b  # affix for the chain directory, e.g. "1a" or "1b", in case we change the configuration.
+ali=tri3_ali
+chain_model_dir=exp/chain${nnet3_affix}/cnn_1a  # the chain system (cnn_1a) whose alignments we use;
+                                                # note: cnn${affix} would wrongly point at this script's own output.
+common_egs_dir=
+reporting_email=
+
+# chain options
+train_stage=-10
+xent_regularize=0.1
+frame_subsampling_factor=4
+alignment_subsampling_factor=1
+# training chunk-options
+chunk_width=340,300,200,100
+num_leaves=500
+# we don't need extra left/right context for TDNN systems.
+chunk_left_context=0
+chunk_right_context=0
+tdnn_dim=450
+# training options
+srand=0
+remove_egs=false
+lang_test=lang_test
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+if [ $stage -le 1 ]; then
+  # Create a version of the lang/ directory with one state per phone in the
+  # topo file.  [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang; not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on we may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 2 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  local/chain/align_nnet3_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \
+    data/$lang_test $chain_model_dir $lat_dir
+  cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts
+fi
+
+if [ $stage -le 3 ]; then
+  # Build a tree using our new topology.  We know we already have alignments
+  # for the training data (in $ali_dir), so use those.  The num-leaves is
+  # always somewhat less than the num-leaves from the GMM baseline.
+  if [ -f $tree_dir/final.mdl ]; then
+    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+    exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor $frame_subsampling_factor \
+    --context-opts "--context-width=2 --central-position=1" \
+    --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} \
+    $lang $ali_dir $tree_dir
+fi
+
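+# (Optional sanity check, not part of the original recipe: stage 4 below reads
+# the leaf count back from the tree with the same tool, so after stage 3 runs
+# you can inspect it by hand with, e.g.:
+#   tree-info $tree_dir/tree | grep num-pdfs
+# The count is at most $num_leaves; the chain_dir_info.pl line in the header,
+# dim=40->364, shows 364 pdfs for this setup.)
+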
+
+if [ $stage -le 4 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
+  learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+  common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36"
+  common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70"
+  common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70"
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=40 name=input
+
+  conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1
+  conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
+  conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
+  conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3
+  conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3
+  relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim
+  relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim
+  relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim
+
+  ## adding the layers for chain branch
+  relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
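+# (Worked example, not part of the original script: with xent_regularize=0.1,
+# the stage-4 command
+#   echo "print 0.5/$xent_regularize" | python
+# evaluates 0.5/0.1 = 5.0, so output-xent above is built with
+# learning-rate-factor=5.0.)
+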
+if [ $stage -le 5 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+      /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage=$train_stage \
+    --cmd="$decode_cmd" \
+    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient=0.1 \
+    --chain.l2-regularize=0.00005 \
+    --chain.apply-deriv-weights=false \
+    --chain.lm-opts="--num-extra-lm-states=500" \
+    --chain.frame-subsampling-factor=$frame_subsampling_factor \
+    --chain.alignment-subsampling-factor=$alignment_subsampling_factor \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs=4 \
+    --trainer.frames-per-iter=1000000 \
+    --trainer.optimization.num-jobs-initial=2 \
+    --trainer.optimization.num-jobs-final=4 \
+    --trainer.optimization.initial-effective-lrate=0.001 \
+    --trainer.optimization.final-effective-lrate=0.0001 \
+    --trainer.optimization.shrink-value=1.0 \
+    --trainer.num-chunk-per-minibatch=64,32 \
+    --trainer.optimization.momentum=0.0 \
+    --egs.chunk-width=$chunk_width \
+    --egs.chunk-left-context=$chunk_left_context \
+    --egs.chunk-right-context=$chunk_right_context \
+    --egs.chunk-left-context-initial=0 \
+    --egs.chunk-right-context-final=0 \
+    --egs.dir="$common_egs_dir" \
+    --egs.opts="--frames-overlap-per-eg 0" \
+    --cleanup.remove-egs=$remove_egs \
+    --use-gpu=true \
+    --reporting.email="$reporting_email" \
+    --feat-dir=$train_data_dir \
+    --tree-dir=$tree_dir \
+    --lat-dir=$lat_dir \
+    --dir=$dir || exit 1;
+fi
+
+if [ $stage -le 6 ]; then
+  # The reason we are using data/$lang_test here, instead of $lang, is just to
+  # emphasize that it's not actually important to give mkgraph.sh the
+  # lang directory with the matched topology (since it gets the
+  # topology file from the model).  So you could give it a different
+  # lang directory, one that contains a wordlist and LM of your choice,
+  # as long as phones.txt is compatible.
+
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 data/$lang_test \
+    $dir $dir/graph || exit 1;
+fi
+
+if [ $stage -le 7 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+    --extra-left-context $chunk_left_context \
+    --extra-right-context $chunk_right_context \
+    --extra-left-context-initial 0 \
+    --extra-right-context-final 0 \
+    --frames-per-chunk $frames_per_chunk \
+    --nj $nj --cmd "$decode_cmd" \
+    $dir/graph data/test $dir/decode_test || exit 1;
+fi
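
# A rough usage sketch (not part of the patch; it assumes run.sh in egs/iam/s5
# has already built the tri3 GMM system and the cnn_1a / cnn_chainali_1a chain
# systems that provide the alignments and the baseline).  From egs/iam/s5,
# something like the following should reproduce the numbers quoted in the new
# script's header:
#
#   local/chain/run_cnn_chainali_1b.sh
#   local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_chainali_1b/
#   head exp/chain/cnn_chainali_1b/decode_test/scoring_kaldi/best_wer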