diff --git a/egs/hkust/s5/RESULTS b/egs/hkust/s5/RESULTS
index 9447f40dd39..674dcea38d1 100644
--- a/egs/hkust/s5/RESULTS
+++ b/egs/hkust/s5/RESULTS
@@ -7,3 +7,6 @@ exp/tri5a/decode/cer_13:%WER 49.67 [ 27891 / 56154, 2877 ins, 4538 del, 20476 su
 exp/tri5a_mce/decode/cer_11:%WER 44.74 [ 25125 / 56154, 2112 ins, 4108 del, 18905 sub ]
 exp/tri5a_mmi_b0.1/decode/cer_11:%WER 44.24 [ 24840 / 56154, 2060 ins, 4118 del, 18662 sub ]
 exp/tri5a_mpe/decode/cer_12:%WER 44.96 [ 25247 / 56154, 2233 ins, 4174 del, 18840 sub ]
+
+# ConvNet with 2 convolutional layers and 2 ReLU layers
+exp/nnet2_convnet/decode/cer_10:%WER 40.73 [ 22873 / 56154, 2609 ins, 3712 del, 16552 sub ]
diff --git a/egs/hkust/s5/local/nnet2/run_convnet.sh b/egs/hkust/s5/local/nnet2/run_convnet.sh
new file mode 100755
index 00000000000..f5baab0dc5d
--- /dev/null
+++ b/egs/hkust/s5/local/nnet2/run_convnet.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+# 2015  Xingyu Na
+# This script runs on the full training set, using ConvNet setup on top of
+# fbank features, on GPU.  The ConvNet has four hidden layers: two convolutional
+# layers and two affine transform layers with ReLU nonlinearity.
+# Convolutional layer [1]:
+#   convolution1d, input feature dim is 36, filter dim is 7, output dim is
+#   30, 128 filters are used
+#   maxpooling, 3-to-1 maxpooling, input dim is 30, output dim is 10
+# Convolutional layer [2]:
+#   convolution1d, input feature dim is 10, filter dim is 4, output dim is
+#   7, 256 filters are used
+# Affine transform layers [3-4]:
+#   affine transform with ReLU nonlinearity.
+
+temp_dir=
+dir=exp/nnet2_convnet
+stage=-5
+train_original=data/train
+train=data-fb/train
+
+. ./cmd.sh
+. ./path.sh
+
+. utils/parse_options.sh
+
+parallel_opts="--gpu 1"  # This is suitable for the CLSP network; you'll
+                         # likely have to change it.
+
+# Make the FBANK features
+if [ $stage -le -5 ]; then
+  # Dev set
+  utils/copy_data_dir.sh data/dev data-fb/dev || exit 1; rm data-fb/dev/{cmvn,feats}.scp
+  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
+    data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
+  steps/compute_cmvn_stats.sh data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
+  # Training set
+  utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp
+  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
+    $train $train/log $train/data || exit 1;
+  steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
+fi
+
+(
+  if [ ! -f $dir/final.mdl ]; then
+    steps/nnet2/train_convnet_accel2.sh --parallel-opts "$parallel_opts" \
+      --cmd "$decode_cmd" --stage $stage \
+      --num-threads 1 --minibatch-size 512 \
+      --mix-up 20000 --samples-per-iter 300000 \
+      --num-epochs 15 --delta-order 2 \
+      --initial-effective-lrate 0.0005 --final-effective-lrate 0.000025 \
+      --num-jobs-initial 3 --num-jobs-final 8 --splice-width 5 \
+      --hidden-dim 2000 --num-filters1 128 --patch-dim1 7 --pool-size 3 \
+      --num-filters2 256 --patch-dim2 4 \
+      $train data/lang exp/tri5a_ali $dir || exit 1;
+  fi
+
+  steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \
+    --config conf/decode.config \
+    exp/tri5a/graph data-fb/dev \
+    $dir/decode || exit 1;
+)
diff --git a/egs/wsj/s5/steps/nnet2/decode.sh b/egs/wsj/s5/steps/nnet2/decode.sh
index df8600df32b..753411f4563 100755
--- a/egs/wsj/s5/steps/nnet2/decode.sh
+++ b/egs/wsj/s5/steps/nnet2/decode.sh
@@ -84,7 +84,12 @@ fi
 splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
 
 case $feat_type in
-  raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";;
+  raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
+    if [ -f $srcdir/delta_order ]; then
+      delta_order=`cat $srcdir/delta_order 2>/dev/null`
+      feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
+    fi
+    ;;
   lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    ;;
   *) echo "$0: invalid feature type $feat_type" && exit 1;
diff --git a/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh b/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh
new file mode 100755
index 00000000000..1c34749ba7f
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh
@@ -0,0 +1,674 @@
+#!/bin/bash
+
+# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).
+#                2013  Xiaohui Zhang
+#                2013  Guoguo Chen
+#                2014  Vimal Manohar
+# Apache 2.0.
+
+# train_convnet_accel2.sh is modified from train_pnorm_accel2.sh.  It prototypes
+# the training of a ConvNet.  The ConvNet is composed of 4 layers.  The first layer
+# is a Convolutional1d component plus a Maxpooling component.  The second layer
+# is a single Convolutional1d component.  The third and fourth layers are affine
+# components with ReLU nonlinearities.  Due to the non-squashing output, a normalize
+# component is applied to all four layers.
+
+# train_pnorm_accel2.sh is a modified form of train_pnorm_simple2.sh (the "2"
+# suffix is because they both use the "new" egs format, created by
+# get_egs2.sh).  The "accel" part of the name refers to the fact that this
+# script uses a number of jobs that can increase during training.  You can
+# specify --num-jobs-initial and --num-jobs-final to control these separately.
+# Also, in this script, the learning rates specified by --initial-effective-lrate
+# and --final-effective-lrate are the "effective learning rates" (defined as the
+# learning rate divided by the number of jobs), and the actual learning rates
+# used will be the specified learning rates multiplied by the current number
+# of jobs.  You'll want to set these lower than you would previously have set
+# the learning rates, by a factor equal to the (previous) number of jobs.
+
+
+# Begin configuration section.
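+# (A worked example of the effective-learning-rate convention described above,
+# using the values that egs/hkust/s5/local/nnet2/run_convnet.sh passes to this
+# script: --initial-effective-lrate 0.0005 with --num-jobs-initial 3 gives an
+# actual starting learning rate of 0.0005 * 3 = 0.0015, and
+# --final-effective-lrate 0.000025 with --num-jobs-final 8 gives a final rate
+# of 0.000025 * 8 = 0.0002; this mirrors the initial_lrate computation done
+# further down in this script.)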
+cmd=run.pl
+num_epochs=15      # Number of epochs of training;
+                   # the number of iterations is worked out from this.
+initial_effective_lrate=0.01
+final_effective_lrate=0.001
+bias_stddev=0.5
+hidden_dim=3000
+minibatch_size=128 # by default use a smallish minibatch size for neural net
+                   # training; this controls instability which would otherwise
+                   # be a problem with multi-threaded update.
+
+samples_per_iter=400000 # each iteration of training, see this many samples
+                        # per job.  This option is passed to get_egs.sh
+num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training.
+num_jobs_final=8   # Number of jobs to run in parallel at the end of training.
+
+prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
+                        # more than enough.
+num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
+get_egs_stage=0
+online_ivector_dir=
+
+
+max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
+                      # to the final 'combine' stage, but these models will themselves be averages of
+                      # iteration-number ranges.
+
+shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
+                         # on each iter.  You could set it to 0 or to a large value for complete
+                         # randomization, but this would both consume memory and cause spikes in
+                         # disk I/O.  Smaller is easier on disk and memory but less random.  It's
+                         # not a huge deal though, as samples are anyway randomized right at the start.
+                         # (the point of this is to get data in different minibatches on different iterations,
+                         # since in the preconditioning method, 2 samples in the same minibatch can
+                         # affect each others' gradients.)
+
+add_layers_period=2 # by default, add new layers every 2 iterations.
+stage=-3
+
+splice_width=4 # meaning +- 4 frames on each side for second LDA
+left_context=  # if set, overrides splice-width
+right_context= # if set, overrides splice-width.
+randprune=4.0 # speeds up LDA.
+alpha=4.0 # relates to preconditioning.
+update_period=4 # relates to online preconditioning: says how often we update the subspace.
+num_samples_history=2000 # relates to online preconditioning
+max_change_per_sample=0.075
+precondition_rank_in=20  # relates to online preconditioning
+precondition_rank_out=80 # relates to online preconditioning
+
+num_filters1=128 # number of filters in the first convolutional layer
+patch_step1=1    # patch step of the first convolutional layer
+patch_dim1=7     # dim of convolutional kernel in the first layer
+pool_size=3      # size of pooling after the first convolutional layer
+num_filters2=256 # number of filters in the second convolutional layer
+patch_dim2=4     # dim of convolutional kernel in the second layer
+
+mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
+         # specified.)
+num_threads=16
+parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G"
+  # by default we use 16 threads; this lets the queue know.
+  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
+combine_num_threads=8
+combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage.
+cleanup=true
+egs_dir=
+lda_opts=
+lda_dim=
+egs_opts=
+delta_order=
+io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time.
+transform_dir=  # If supplied, overrides alidir
+postdir=
+cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
+            # only relevant for "raw" features, not lda.
+feat_type=  # Can be used to force "raw" features.
+align_cmd=     # The cmd that is passed to steps/nnet2/align.sh
+align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
+realign_times= # List of times on which we realign.  Each time is a
+               # floating point number strictly between 0 and 1, which
+               # will be multiplied by the num-iters to get an iteration
+               # number.
+num_jobs_align=30 # Number of jobs for realignment
+srand=0 # random seed used to initialize the nnet
+# End configuration section.
+
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 4 ]; then
+  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
+  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --config <config-file>                       # config file containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --num-epochs <#epochs|15>                    # Number of epochs of training"
+  echo "  --initial-effective-lrate                    # effective learning rate at start of training,"
+  echo "                                               # actual learning-rate is this times num-jobs."
+  echo "  --final-effective-lrate                      # effective learning rate at end of training."
+  echo "  --add-layers-period <#iters|2>               # Number of iterations between adding hidden layers"
+  echo "  --mix-up <#pseudo-gaussians|0>               # Can be used to have multiple targets in final output layer,"
+  echo "                                               # per context-dependent state.  Try a number several times #states."
+  echo "  --num-jobs-initial                           # Number of parallel jobs to use for neural net training, at the start."
+  echo "  --num-jobs-final                             # Number of parallel jobs to use for neural net training, at the end."
+  echo "  --num-threads                                # Number of parallel threads per job (will affect results"
+  echo "                                               # as well as speed; may interact with batch size; if you increase"
+  echo "                                               # this, you may want to decrease the batch size)."
+  echo "  --parallel-opts                              # extra options to pass to e.g. queue.pl for processes that"
+  echo "                                               # use multiple threads... note, you might have to reduce mem_free,ram_free"
+  echo "                                               # versus your defaults, because it gets multiplied by the -pe smp argument."
+  echo "  --io-opts                                    # Options given to e.g. queue.pl for jobs that do a lot of I/O."
+  echo "  --minibatch-size                             # Size of minibatch to process (note: product with --num-threads"
+  echo "                                               # should not get too large, e.g. >2k)."
+  echo "  --samples-per-iter <#samples|400000>         # Number of samples of data to process per iteration, per"
+  echo "                                               # process."
+  echo "  --splice-width                               # Number of frames on each side to append for feature input"
+  echo "                                               # (note: we splice processed, typically 40-dimensional frames)"
+  echo "  --realign-times                              # A space-separated list of times (fractions of the training run,"
+  echo "                                               # strictly between 0 and 1) at which realignment is to be done"
+  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
+  echo "  --align-use-gpu (yes/no)                     # specify if gpu is to be used for realignment"
+  echo "  --num-jobs-align <#njobs|30>                 # Number of jobs to perform realignment"
+  echo "  --stage                                      # Used to run a partially-completed training process from somewhere in"
+  echo "                                               # the middle."
+  echo "ConvNet configurations"
+  echo "  --num-filters1                               # number of filters in the first convolutional layer."
+  echo "  --patch-step1                                # patch step of the first convolutional layer."
+  echo "  --patch-dim1                                 # dim of convolutional kernel in the first layer."
+  echo "                                               # (note: (feat-dim - patch-dim1) % patch-step1 should be 0.)"
+  echo "  --pool-size                                  # size of pooling after the first convolutional layer."
+  echo "                                               # (note: (feat-dim - patch-dim1 + 1) % pool-size should be 0.)"
+  echo "  --num-filters2                               # number of filters in the second convolutional layer."
+  echo "  --patch-dim2                                 # dim of convolutional kernel in the second layer."
+
+
+  exit 1;
+fi
+
+data=$1
+lang=$2
+alidir=$3
+dir=$4
+
+if [ ! -z "$realign_times" ]; then
+  [ -z "$align_cmd" ] && echo "$0: realign_times specified but align_cmd not specified" && exit 1
+  [ -z "$align_use_gpu" ] && echo "$0: realign_times specified but align_use_gpu not specified" && exit 1
+fi
+
+# Check some files.
+for f in $data/feats.scp $lang/L.fst $alidir/final.mdl $alidir/tree; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+[ ! -f $postdir/post.1.scp ] && [ ! -f $alidir/ali.1.gz ] && echo "$0: no (soft) alignments provided" && exit 1;
+
+trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
+
+# Set some variables.
+num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1
+[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1
+[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1
+
+nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
+# in this dir we'll have just one job.
+sdata=$data/split$nj
+utils/split_data.sh $data $nj
+
+mkdir -p $dir/log
+echo $nj > $dir/num_jobs
+cp $alidir/tree $dir
+
+extra_opts=()
+[ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
+[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
+[ ! -z "$delta_order" ] && extra_opts+=(--delta-order $delta_order)
+[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
+[ -z "$transform_dir" ] && transform_dir=$alidir
+extra_opts+=(--transform-dir $transform_dir)
+[ -z "$left_context" ] && left_context=$splice_width
+[ -z "$right_context" ] && right_context=$splice_width
+extra_opts+=(--left-context $left_context --right-context $right_context)
+
+feat-to-dim scp:$sdata/1/feats.scp - > $dir/feat_dim
+feat_dim=$(cat $dir/feat_dim) || exit 1;
+
+if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
+  echo "$0: calling get_egs2.sh"
+  steps/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}" --io-opts "$io_opts" \
+      --postdir "$postdir" --samples-per-iter $samples_per_iter --stage $get_egs_stage \
+      --cmd "$cmd" --feat-type "raw" $data $alidir $dir/egs || exit 1;
+fi
+
+if [ -f $dir/egs/cmvn_opts ]; then
+  cp $dir/egs/cmvn_opts $dir
+fi
+
+if [ -f $dir/egs/delta_order ]; then
+  cp $dir/egs/delta_order $dir
+fi
+
+if [ -z $egs_dir ]; then
+  egs_dir=$dir/egs
+fi
+
+frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
+num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }
+
+# num_archives_expanded considers each separate label-position from
+# 0..frames_per_eg-1 to be a separate archive.
+num_archives_expanded=$[$num_archives*$frames_per_eg]
+
+[ $num_jobs_initial -gt $num_jobs_final ] && \
+  echo "$0: --num-jobs-initial cannot exceed --num-jobs-final" && exit 1;
+
+[ $num_jobs_final -gt $num_archives_expanded ] && \
+  echo "$0: --num-jobs-final cannot exceed #archives $num_archives_expanded." && exit 1;
+
+if ! [ $num_hidden_layers -ge 1 ]; then
+  echo "Invalid num-hidden-layers $num_hidden_layers"
+  exit 1
+fi
+
+if [ $stage -le -2 ]; then
+  echo "$0: initializing neural net";
+  tot_splice=$[($delta_order+1)*($left_context+1+$right_context)]
+  delta_feat_dim=$[($delta_order+1)*$feat_dim]
+  tot_input_dim=$[$feat_dim*$tot_splice]
+  num_patch1=$[1+($feat_dim-$patch_dim1)/$patch_step1]
+  num_pool=$[$num_patch1/$pool_size]
+  patch_dim2=$[$patch_dim2*$num_filters1]
+  patch_step2=$num_filters1
+  patch_stride2=$[$num_pool*$num_filters1]  # same as pool outputs
+  num_patch2=$[1+($num_pool*$num_filters1-$patch_dim2)/$patch_step2]
+  conv_out_dim1=$[$num_filters1*$num_patch1]  # 128 x (36 - 7 + 1)
+  pool_out_dim=$[$num_filters1*$num_pool]
+  conv_out_dim2=$[$num_filters2*$num_patch2]
+
+  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"
+
+  initial_lrate=$(perl -e "print ($initial_effective_lrate*$num_jobs_initial);")
+  stddev=`perl -e "print 1.0/sqrt($hidden_dim);"`
+  cat >$dir/nnet.config <$dir/replace.1.config <$dir/replace.2.config <
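The geometry that the initialization stage above works out can be checked by hand. The following standalone sketch is not part of the patch; it simply mirrors the shell arithmetic of train_convnet_accel2.sh for the values the hkust recipe passes in (36-dimensional fbank input, filter dims 7 and 4, pool size 3, 128 and 256 filters), reproducing the 36 -> 30 -> 10 -> 7 dimensions quoted in the run_convnet.sh header:

#!/bin/bash
# Illustrative check of the ConvNet geometry (not part of the patch).
feat_dim=36        # fbank dim assumed by run_convnet.sh
patch_dim1=7; patch_step1=1; pool_size=3
num_filters1=128; patch_dim2=4; num_filters2=256

num_patch1=$[1+($feat_dim-$patch_dim1)/$patch_step1]   # 1 + (36-7)/1 = 30
num_pool=$[$num_patch1/$pool_size]                     # 30 / 3 = 10
pool_out_dim=$[$num_filters1*$num_pool]                # 128 * 10 = 1280

# The second convolution runs over the pooled maps; its patch dim and step are
# scaled by num_filters1, as in the initialization stage above.
patch_dim2_full=$[$patch_dim2*$num_filters1]           # 4 * 128 = 512
patch_step2=$num_filters1                              # 128
num_patch2=$[1+($num_pool*$num_filters1-$patch_dim2_full)/$patch_step2]  # 1 + (1280-512)/128 = 7
conv_out_dim2=$[$num_filters2*$num_patch2]             # 256 * 7 = 1792

echo "conv1 patches: $num_patch1  pooled: $num_pool  conv2 patches: $num_patch2"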