diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4q.sh b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh new file mode 100755 index 00000000000..7d91e5e66f5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +# this is based on Dan's tdnn_2o script +# it has a different splicing configuration +# it uses the PerDimensionWeightedAverage pooling in place of the Jesus layer + +# it relies on new steps/nnet3/chain/train_tdnn_b.sh script which accepts more +# parameters are calls new config generator steps/nnet3/tdnn/make_configs.py +# which is more in line with other config generators. + +set -e + +#%WER 11.1 | 1831 21395 | 90.2 6.3 3.5 1.3 11.1 46.6 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 16.6 | 4459 42989 | 85.2 9.5 5.3 1.8 16.6 53.4 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 15.59 [ 7671 / 49204, 883 ins, 2234 del, 4554 sub ] exp/chain/tdnn_v1_trial6_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 + + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4q # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window=7 +pool_type='per-dim-weighted-average' +pool_lpfilter_width= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
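As a quick sanity check on the splice configuration above: the total model context is just the sum, over layers, of each layer's most negative and most positive offsets (this mirrors the parsing added in steps/nnet3/tdnn/make_configs.py later in this patch; the helper below is purely illustrative and ignores any extra context added by the pooling window).

def total_context(splice_indexes):
    # each space-separated group is one layer; its extreme offsets add to
    # the overall left/right context of the network
    left = right = 0
    for layer in splice_indexes.split():
        offsets = sorted(int(x) for x in layer.split(","))
        left += max(0, -offsets[0])
        right += max(0, offsets[-1])
    return left, right

print(total_context("-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"))  # (18, 13)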
+ + steps/nnet3/chain/train_tdnn_b.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --pool-type "$pool_type" \ + --pool-window "$pool_window" \ + --pool-lpfilter-width "$pool_lpfilter_width" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim $relu_dim \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + --egs-dir "$common_egs_dir" \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index 46ba6d13925..ea3898b83da 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -41,6 +41,7 @@ frames_per_iter=800000 # each iteration of training, see this many [input] # frames per job. This option is passed to get_egs.sh. # Aim for about a minute of training time right_tolerance=10 +left_tolerance=5 denominator_scale=1.0 # relates to tombsone stuff. num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training @@ -73,6 +74,10 @@ exit_stage=-100 # you can set this to terminate the training early. Exits befor # count space-separated fields in splice_indexes to get num-hidden-layers. 
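To make the comment above concrete: the number of hidden layers is simply the number of space-separated groups in the splice string. A one-line illustration (not part of the script), using the default shown just below:

splice_indexes = "-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0"
num_hidden_layers = len(splice_indexes.split())
print(num_hidden_layers)  # 6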
splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" +pool_type='none' +pool_window= +pool_lpfilter_width= + # Format : layer/....layer/ " # note: hidden layers which are composed of one or more components, # so hidden layer indexing is different from component count @@ -219,8 +224,14 @@ if [ $stage -le -5 ]; then else dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" fi + # create the config files for nnet initialization - python steps/nnet3/make_tdnn_configs.py \ + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + + python steps/nnet3/tdnn/make_configs.py $pool_opts\ --include-log-softmax=false \ --final-layer-normalize-target $final_layer_normalize_target \ --splice-indexes "$splice_indexes" \ @@ -231,6 +242,7 @@ if [ $stage -le -5 ]; then --use-presoftmax-prior-scale false \ $dir/configs || exit 1; fi + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; # we do this as it's a convenient way to get the stats for the 'lda-like' @@ -264,6 +276,8 @@ if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then steps/nnet3/chain/get_egs.sh $egs_opts "${extra_opts[@]}" \ --frames-per-iter $frames_per_iter --stage $get_egs_stage \ --cmd "$cmd" \ + --right-tolerance "$right_tolerance" \ + --left-tolerance "$left_tolerance" \ --frames-per-eg $frames_per_eg \ --frame-subsampling-factor $frame_subsampling_factor \ $data $dir $latdir $dir/egs || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 87323a1c3e1..0383a9ff6f3 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -6,6 +6,45 @@ import sys import warnings import copy +from operator import itemgetter +import numpy as np +try: + import scipy.signal as signal + has_scipy_signal = True +except ImportError: + has_scipy_signal = False + +def WriteKaldiMatrix(matrix, matrix_file_name): + assert(len(matrix.shape) == 2) + # matrix is a numpy array + matrix_file = open(matrix_file_name, "w") + [rows, cols ] = matrix.shape + matrix_file.write('[\n') + for row in range(rows): + matrix_file.write(' '.join( map(lambda x: '{0:f}'.format(x), matrix[row, : ]))) + if row == rows - 1: + matrix_file.write("]") + else: + matrix_file.write('\n') + matrix_file.close() +def GetSumDescriptor(inputs): + sum_descriptors = inputs + while len(sum_descriptors) != 1: + cur_sum_descriptors = [] + pair = [] + while len(sum_descriptors) > 0: + value = sum_descriptors.pop() + if value.strip() != '': + pair.append(value) + if len(pair) == 2: + cur_sum_descriptors.append("Sum({0}, {1})".format(pair[0], pair[1])) + pair = [] + if pair: + cur_sum_descriptors.append(pair[0]) + sum_descriptors = cur_sum_descriptors + return sum_descriptors + + # adds the input nodes and returns the descriptor def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): @@ -19,11 +58,26 @@ def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): components.append('input-node name=ivector dim=' + str(ivector_dim)) list.append('ReplaceIndex(ivector, t, 0)') output_dim += ivector_dim - splice_descriptor = "Append({0})".format(", ".join(list)) + if len(list) > 1: + splice_descriptor = "Append({0})".format(", ".join(list)) + else: + splice_descriptor = list[0] 
print(splice_descriptor) return {'descriptor': splice_descriptor, 'dimension': output_dim} +def AddNoOpLayer(config_lines, name, input): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append('component name={0}_noop type=NoOpComponent dim={1}'.format(name, input['dimension'])) + component_nodes.append('component-node name={0}_noop component={0}_noop input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_noop'.format(name), + 'dimension': input['dimension']} + + + def AddLdaLayer(config_lines, name, input, lda_file): components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -34,6 +88,30 @@ def AddLdaLayer(config_lines, name, input, lda_file): return {'descriptor': '{0}_lda'.format(name), 'dimension': input['dimension']} +def AddBlockAffineLayer(config_lines, name, input, output_dim, num_blocks): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + assert((input['dimension'] % num_blocks == 0) and + (output_dim % num_blocks == 0)) + components.append('component name={0}_block_affine type=BlockAffineComponent input-dim={1} output-dim={2} num-blocks={3}'.format(name, input['dimension'], output_dim, num_blocks)) + component_nodes.append('component-node name={0}_block_affine component={0}_block_affine input={1}'.format(name, input['descriptor'])) + + return {'descriptor' : '{0}_block_affine'.format(name), + 'dimension' : output_dim} + + +def AddPermuteLayer(config_lines, name, input, column_map): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + permute_indexes = ",".join(map(lambda x: str(x), column_map)) + components.append('component name={0}_permute type=PermuteComponent column-map={1}'.format(name, permute_indexes)) + component_nodes.append('component-node name={0}_permute component={0}_permute input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_permute'.format(name), + 'dimension': input['dimension']} + + + def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = ""): components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -60,6 +138,36 @@ def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options 'dimension': output_dim} +def AddConvolutionLayer(config_lines, name, input, + input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + num_filters, input_vectorization, + param_stddev = None, bias_stddev = None, + filter_bias_file = None, + is_updatable = True): + assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim) + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + conv_init_string = "component name={0}_conv type=ConvolutionComponent input-x-dim={1} input-y-dim={2} input-z-dim={3} filt-x-dim={4} filt-y-dim={5} filt-x-step={6} filt-y-step={7} input-vectorization-order={8}".format(name, input_x_dim, input_y_dim, input_z_dim, filt_x_dim, filt_y_dim, filt_x_step, filt_y_step, input_vectorization) + if filter_bias_file is not None: + conv_init_string += " matrix={0}".format(filter_bias_file) + if is_updatable: + conv_init_string += " is-updatable=true" + else: + conv_init_string += " is-updatable=false" + + components.append(conv_init_string) + component_nodes.append("component-node name={0}_conv_t component={0}_conv input={1}".format(name, input['descriptor'])) + + num_x_steps = (1 + 
(input_x_dim - filt_x_dim) / filt_x_step) + num_y_steps = (1 + (input_y_dim - filt_y_dim) / filt_y_step) + output_dim = num_x_steps * num_y_steps * num_filters; + return {'descriptor': '{0}_conv_t'.format(name), + 'dimension': output_dim} + + def AddSoftmaxLayer(config_lines, name, input): components = config_lines['components'] @@ -72,7 +180,7 @@ def AddSoftmaxLayer(config_lines, name, input): 'dimension': input['dimension']} -def AddOutputNode(config_lines, input, label_delay=None): +def AddOutputLayer(config_lines, input, label_delay=None): components = config_lines['components'] component_nodes = config_lines['component-nodes'] if label_delay is None: @@ -80,12 +188,18 @@ def AddOutputNode(config_lines, input, label_delay=None): else: component_nodes.append('output-node name=output input=Offset({0},{1})'.format(input['descriptor'], label_delay)) -def AddFinalLayer(config_lines, input, output_dim, ng_affine_options = "", label_delay=None, include_softmax = "true"): +def AddFinalLayer(config_lines, input, output_dim, ng_affine_options = " param-stddev=0 bias-stddev=0 ", label_delay=None, use_presoftmax_prior_scale = False, prior_scale_file = None, include_log_softmax = True): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + prev_layer_output = AddAffineLayer(config_lines, "Final", input, output_dim, ng_affine_options) - if include_softmax == "true": - prev_layer_output = AddSoftmaxLayer(config_lines, "Final", prev_layer_output) - AddOutputNode(config_lines, prev_layer_output, label_delay) - + if include_log_softmax: + if use_presoftmax_prior_scale : + components.append('component name=Final-fixed-scale type=FixedScaleComponent scales={0}'.format(prior_scale_file)) + component_nodes.append('component-node name=Final-fixed-scale component=Final-fixed-scale input={0}'.format(prev_layer_output['descriptor'])) + prev_layer_output['descriptor'] = "Final-fixed-scale" + prev_layer_output = AddSoftmaxLayer(config_lines, "Final", prev_layer_output) + AddOutputLayer(config_lines, prev_layer_output, label_delay) def AddLstmLayer(config_lines, name, input, cell_dim, diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py new file mode 100755 index 00000000000..e100ac0f2af --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import os +import argparse +import sys +import warnings +import copy +import imp +import ast +import scipy.signal as signal +import numpy as np + +nodes = imp.load_source('', 'steps/nnet3/nodes.py') + + +def AddPerDimAffineLayer(config_lines, name, input, input_window): + filter_context = int((input_window - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + + # add permute component to shuffle the feature columns of the Append + # descriptor output so that columns corresponding to the same feature index + # are contiguous add a block-affine component to collapse all the feature + # indexes across time steps into a single value + num_feats = 
input['dimension'] + num_times = len(filter_input_splice_indexes) + column_map = [] + for i in range(num_feats): + for j in range(num_times): + column_map.append(j * num_feats + i) + permuted_output_descriptor = nodes.AddPermuteLayer(config_lines, + name, filter_input_descriptor, column_map) + + # add a block-affine component + output_descriptor = nodes.AddBlockAffineComponent(config_lines, name, + permuted_output_descriptor, + num_feats, num_feats) + + return [output_descriptor, filter_context, filter_context] + + +def AddLpFilter(config_lines, name, input, rate, num_lpfilter_taps, lpfilt_filename, is_updatable = False): + # low-pass smoothing of input was specified. so we will add a low-pass filtering layer + lp_filter = signal.firwin(num_lpfilter_taps, rate, width=None, window='hamming', pass_zero=True, scale=True, nyq=1.0) + lp_filter = np.append(lp_filter, 0) + nodes.WriteKaldiMatrix(np.array([lp_filter]), lpfilt_filename) + filter_context = int((num_lpfilter_taps - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + input_x_dim = len(filter_input_splice_indexes) + input_y_dim = input['dimension'] + input_z_dim = 1 + filt_x_dim = len(filter_input_splice_indexes) + filt_y_dim = 1 + filt_x_step = 1 + filt_y_step = 1 + input_vectorization = 'zyx' + + tdnn_input_descriptor = nodes.AddConvolutionLayer(config_lines, name, + filter_input_descriptor, + input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + 1, input_vectorization, + filter_bias_file = lpfilt_filename, + is_updatable = is_updatable) + + + return [tdnn_input_descriptor, filter_context, filter_context] + + + +def PrintConfig(file_name, config_lines): + f = open(file_name, 'w') + f.write("\n".join(config_lines['components'])+"\n") + f.write("\n#Component nodes\n") + f.write("\n".join(config_lines['component-nodes'])) + f.close() + +def ParseSpliceString(splice_indexes, label_delay=None): + ## Work out splice_array e.g. splice_array = [ [ -3,-2,...3 ], [0], [-2,2], .. [ -8,8 ] ] + splice_array = [] + left_context = 0 + right_context = 0 + split1 = args.splice_indexes.split(" "); # we already checked the string is nonempty. 
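A standalone check of what AddLpFilter above builds: the smoothing taps come directly from scipy.signal.firwin, and a window of N taps adds (N-1)/2 frames of context on each side. The cutoff value here is a hypothetical stand-in for --pool-lpfilter-width; the patch additionally appends a zero bias tap before writing the Kaldi matrix.

import numpy as np
import scipy.signal as signal

num_lpfilter_taps = 7    # e.g. pool_window=7, as in the 4q recipe
cutoff = 0.25            # hypothetical normalized cutoff (pool_lpfilter_width)
lp_filter = signal.firwin(num_lpfilter_taps, cutoff, window='hamming')
filter_context = (num_lpfilter_taps - 1) // 2
print(len(lp_filter), filter_context)           # 7 3
assert np.allclose(lp_filter, lp_filter[::-1])  # linear-phase filter: symmetric taps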
+ if len(split1) < 1: + sys.exit("invalid --splice-indexes argument, too short: " + + args.splice_indexes) + try: + for string in split1: + split2 = string.split(",") + if len(split2) < 1: + sys.exit("invalid --splice-indexes argument, too-short element: " + + args.splice_indexes) + int_list = [] + for int_str in split2: + int_list.append(int(int_str)) + if not int_list == sorted(int_list): + sys.exit("elements of --splice-indexes must be sorted: " + + args.splice_indexes) + left_context += -int_list[0] + right_context += int_list[-1] + splice_array.append(int_list) + except ValueError as e: + sys.exit("invalid --splice-indexes argument " + args.splice_indexes + e) + left_context = max(0, left_context) + right_context = max(0, right_context) + num_hidden_layers = len(splice_array) + input_dim = len(splice_array[0]) * args.feat_dim + args.ivector_dim + + return {'left_context':left_context, + 'right_context':right_context, + 'splice_indexes':splice_array, + 'num_hidden_layers':len(splice_array) + } + +if __name__ == "__main__": + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description="Writes config files and variables " + "for TDNNs creation and training", + epilog="See steps/nnet3/tdnn/train.sh for example.") + # General neural network options + parser.add_argument("--splice-indexes", type=str, + help="Splice indexes at input layer, e.g. '-3,-2,-1,0,1,2,3' [compulsary argument]", default="0") + parser.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") + parser.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) + parser.add_argument("--include-log-softmax", type=str, + help="add the final softmax layer ", default="true", choices = ["false", "true"]) + parser.add_argument("--final-layer-normalize-target", type=float, + help="RMS target for final layer (set to <1 if final layer learns too fast", + default=1.0) + parser.add_argument("--subset-dim", type=int, default=0, + help="dimension of the subset of units to be sent to the central frame") + parser.add_argument("--pnorm-input-dim", type=int, + help="input dimension to p-norm nonlinearities") + parser.add_argument("--pnorm-output-dim", type=int, + help="output dimension of p-norm nonlinearities") + parser.add_argument("--relu-dim", type=int, + help="dimension of ReLU nonlinearities") + parser.add_argument("--pool-type", type=str, default = 'none', + help="Type of pooling to be used.", choices = ['low-pass', 'sum', 'max', 'weighted-average', 'per-dim-weighted-average', 'none']) + parser.add_argument("--pool-window", type=int, default = None, + help="Width of the pooling window") + parser.add_argument("--pool-lpfilter-width", type=float, + default = None, help="Nyquist frequency of the lpfilter to be used for pooling") + parser.add_argument("--use-presoftmax-prior-scale", type=str, + help="if true, a presoftmax-prior-scale is added", + choices=['true', 'false'], default = "true") + parser.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") + parser.add_argument("config_dir", + help="Directory to write config files and variables") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
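For the 'per-dim-weighted-average' pool type offered above, AddPerDimAffineLayer earlier in this file permutes the Append() output so that all time-shifted copies of each feature dimension become contiguous before the per-dim BlockAffineComponent collapses them. A small sketch of that column map, with toy sizes chosen only for illustration:

num_feats, num_times = 4, 3   # hypothetical: 4-dim features, pooling window of 3
# Append() lays columns out time-major: [t-1 dims | t dims | t+1 dims]
column_map = [j * num_feats + i for i in range(num_feats) for j in range(num_times)]
print(column_map)   # [0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11]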
+ if args.splice_indexes is None: + sys.exit("--splice-indexes argument is required") + if args.feat_dim is None or not (args.feat_dim > 0): + sys.exit("--feat-dim argument is required") + if args.num_targets is None or not (args.num_targets > 0): + sys.exit("--num-targets argument is required") + if (args.subset_dim < 0): + sys.exit("--subset-dim has to be non-negative") + if (args.pool_window is not None) and (args.pool_window <= 0): + sys.exit("--pool-window has to be positive") + + if not args.relu_dim is None: + if not args.pnorm_input_dim is None or not args.pnorm_output_dim is None: + sys.exit("--relu-dim argument not compatible with " + "--pnorm-input-dim or --pnorm-output-dim options"); + nonlin_input_dim = args.relu_dim + nonlin_output_dim = args.relu_dim + else: + if not args.pnorm_input_dim > 0 or not args.pnorm_output_dim > 0: + sys.exit("--relu-dim not set, so expected --pnorm-input-dim and " + "--pnorm-output-dim to be provided."); + nonlin_input_dim = args.pnorm_input_dim + nonlin_output_dim = args.pnorm_output_dim + + prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(args.config_dir) + if args.use_presoftmax_prior_scale == "true": + use_presoftmax_prior_scale = True + else: + use_presoftmax_prior_scale = False + + parsed_splice_output = ParseSpliceString(args.splice_indexes.strip()) + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + + config_lines = {'components':[], 'component-nodes':[]} + + config_files={} + prev_layer_output = nodes.AddInputLayer(config_lines, args.feat_dim, splice_indexes[0], args.ivector_dim) + + # Add the init config lines for estimating the preconditioning matrices + init_config_lines = copy.deepcopy(config_lines) + init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') + init_config_lines['components'].insert(0, '# preconditioning matrix computation') + nodes.AddOutputLayer(init_config_lines, prev_layer_output) + config_files[args.config_dir + '/init.config'] = init_config_lines + + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat') + + left_context = 0 + right_context = 0 + # we moved the first splice layer to before the LDA.. 
+ # so the input to the first affine layer is going to [0] index + splice_indexes[0] = [0] + for i in range(0, num_hidden_layers): + # make the intermediate config file for layerwise discriminative training + # if specified, pool the input from the previous layer + + # prepare the spliced input + if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): + if args.pool_type != "none" and args.pool_window is None: + raise Exception("Pooling type was specified as {0}, this requires specification of the pool-window".format(args.pool_type)) + if args.pool_type in set(["low-pass", "weighted-average"]): + if args.pool_type == "weighted-average": + lpfilter_is_updatable = True + else: + lpfilter_is_updatable = False + # low-pass filter the input to smooth it before the sub-sampling + [prev_layer_output, cur_left_context, cur_right_context] = AddLpFilter(config_lines, + 'Tdnn_input_smoother_{0}'.format(i), + prev_layer_output, + args.pool_lpfilter_width, + args.pool_window, + args.config_dir + '/Tdnn_input_smoother_{0}.txt'.format(i), + is_updatable = lpfilter_is_updatable) + left_context += cur_left_context + right_context += cur_right_context + + if args.pool_type == "per-dim-weighted-average": + # add permute component to shuffle the feature columns of the Append descriptor output so + # that columns corresponding to the same feature index are contiguous + # add a block-affine component to collapse all the feature indexes across time steps into a single value + [prev_layer_output, cur_left_context, cur_right_context] = AddPerDimAffineLayer(config_lines, + 'Tdnn_input_PDA_{0}'.format(i), + prev_layer_output, + args.pool_window) + + left_context += cur_left_context + right_context += cur_right_context + + if args.pool_type == "sum": + raise NotImplementedError("Sum-pooling has not been tested yet.") + + if args.pool_type == "max" : + raise NotImplementedError("Max-pooling component needs to be reimplemented for this.") + + try: + zero_index = splice_indexes[i].index(0) + except ValueError: + zero_index = None + # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor + prev_layer_output_descriptor = prev_layer_output['descriptor'] + subset_output = prev_layer_output + if args.subset_dim > 0: + # if subset_dim is specified the script expects a zero in the splice indexes + assert(zero_index is not None) + subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, args.subset_dim) + subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i), + 'dimension' : args.subset_dim} + config_lines['component-nodes'].append(subset_node_config) + appended_descriptors = [] + appended_dimension = 0 + for j in range(len(splice_indexes[i])): + if j == zero_index: + appended_descriptors.append(prev_layer_output['descriptor']) + appended_dimension += prev_layer_output['dimension'] + continue + appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[i][j])) + appended_dimension += subset_output['dimension'] + prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), + 'dimension' : appended_dimension} + else: + # this is a normal affine node + pass + prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i), + prev_layer_output, nonlin_output_dim, norm_target_rms = 1.0 if i < num_hidden_layers -1 else args.final_layer_normalize_target) + # a final layer is added after each new layer as we are generating 
configs for layer-wise discriminative training + nodes.AddFinalLayer(config_lines, prev_layer_output, args.num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True if args.include_log_softmax == "true" else False) + + config_files['{0}/layer{1}.config'.format(args.config_dir, i+1)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + + left_context += int(parsed_splice_output['left_context']) + right_context += int(parsed_splice_output['right_context']) + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(args.config_dir + "/vars", "w") + print('left_context=' + str(left_context), file=f) + print('right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + f.close() + + # printing out the configs + # init.config used to train lda-mllt train + for key in config_files.keys(): + PrintConfig(key, config_files[key]) diff --git a/egs/wsj/s5/steps/nnet3/tdnn/train.sh b/egs/wsj/s5/steps/nnet3/tdnn/train.sh new file mode 100755 index 00000000000..773e10ccab6 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/train.sh @@ -0,0 +1,660 @@ +#!/bin/bash + +# note, TDNN is the same as what we used to call multisplice. + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2013 Xiaohui Zhang +# 2013 Guoguo Chen +# 2014 Vimal Manohar +# 2014 Vijayaditya Peddinti +# Apache 2.0. + + +# Begin configuration section. +cmd=run.pl +num_epochs=15 # Number of epochs of training; + # the number of iterations is worked out from this. +initial_effective_lrate=0.01 +final_effective_lrate=0.001 +pnorm_input_dim=3000 +pnorm_output_dim=300 +relu_dim= # you can use this to make it use ReLU's instead of p-norms. +rand_prune=4.0 # Relates to a speedup we do for LDA. +minibatch_size=512 # This default is suitable for GPU-based training. + # Set it to 128 for multi-threaded CPU-based training. +max_param_change=2.0 # max param change per minibatch +samples_per_iter=400000 # each iteration of training, see this many samples + # per job. This option is passed to get_egs.sh +num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training +num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training +prior_subset_size=20000 # 20k samples per job, for computing priors. +num_jobs_compute_prior=10 # these are single-threaded, run on CPU. +get_egs_stage=0 # can be used for rerunning after partial +online_ivector_dir= +presoftmax_prior_scale_power=-0.25 +use_presoftmax_prior_scale=true +remove_egs=true # set to false to disable removing egs after training is done. + +max_models_combine=20 # The "max_models_combine" is the maximum number of models we give + # to the final 'combine' stage, but these models will themselves be averages of + # iteration-number ranges. + +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + # (the point of this is to get data in different minibatches on different iterations, + # since in the preconditioning method, 2 samples in the same minibatch can + # affect each others' gradients. 
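The two 'effective' learning-rate options above interact with the job count: the effective rate decays exponentially from the initial to the final value over training, and the rate actually passed to each parallel job is that effective rate multiplied by the current number of jobs (see the perl one-liner in the main training loop further down). A simplified sketch of the schedule, tracking the iteration fraction directly instead of the processed-archive count the script uses:

import math

def per_job_lrate(it, num_iters, initial_eff=0.01, final_eff=0.001,
                  num_jobs_initial=1, num_jobs_final=8):
    num_jobs = int(0.5 + num_jobs_initial +
                   (num_jobs_final - num_jobs_initial) * float(it) / num_iters)
    frac = float(it) / num_iters
    eff = initial_eff * math.exp(frac * math.log(final_eff / initial_eff))
    return num_jobs, num_jobs * eff

print(per_job_lrate(0, 250))    # (1, 0.01)
print(per_job_lrate(250, 250))  # (8, ~0.008)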
+ +add_layers_period=2 # by default, add new layers every 2 iterations. +stage=-6 +exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage + +# count space-separated fields in splice_indexes to get num-hidden-layers. +splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" +# Format : layer/....layer/ " +# note: hidden layers which are composed of one or more components, +# so hidden layer indexing is different from component count +chunk_training=false # if true training is done with chunk randomization, rather than frame randomization + +randprune=4.0 # speeds up LDA. +use_gpu=true # if true, we run on GPU. +cleanup=true +egs_dir= +max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. +lda_opts= +egs_opts= +transform_dir= # If supplied, this dir used instead of alidir to find transforms. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. + # only relevant for "raw" features, not lda. +feat_type=raw # or set to 'lda' to use LDA features. +align_cmd= # The cmd that is passed to steps/nnet2/align.sh +align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] +realign_times= # List of times on which we realign. Each time is + # floating point number strictly between 0 and 1, which + # will be multiplied by the num-iters to get an iteration + # number. +num_jobs_align=30 # Number of jobs for realignment +# End configuration section. +frames_per_eg=8 # to be passed on to get_egs.sh +subset_dim=0 + +trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|15> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." + echo " # data, 0.00025 for large data" + echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" + echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" + echo " --presoftmax-prior-scale-power # use the specified power value on the priors (inverse priors) to scale" + echo " # the pre-softmax outputs (set to 0.0 to disable the presoftmax element scale)" + echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." + echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" + echo " # results as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" + echo " # versus your defaults, because it gets multiplied by the -pe smp argument." + echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" + echo " # should not get too large, e.g. >2k)." 
+ echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --splice-indexes " + echo " # Frame indices used for each splice layer." + echo " # Format : layer/....layer/ " + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --lda-dim # Dimension to reduce spliced features to with LDA" + echo " --realign-times # A list of space-separated floating point numbers between 0.0 and" + echo " # 1.0 to specify how far through training realignment is to be done" + echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" + echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment" + echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 +dir=$4 + +if [ ! -z "$realign_times" ]; then + [ -z "$align_cmd" ] && echo "$0: realign_times specified but align_cmd not specified" && exit 1 + [ -z "$align_use_gpu" ] && echo "$0: realign_times specified but align_use_gpu not specified" && exit 1 +fi + +# Check some files. +for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +# Set some variables. +num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1 +[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1 +[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1 + +nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir... +# in this dir we'll have just one job. +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +cp $alidir/tree $dir + + +# First work out the feature and iVector dimension, needed for tdnn config creation. +case $feat_type in + raw) feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \ + { echo "$0: Error getting feature dim"; exit 1; } + ;; + lda) [ ! -f $alidir/final.mat ] && echo "$0: With --feat-type lda option, expect $alidir/final.mat to exist." + # get num-rows in lda matrix, which is the lda feature dim. + feat_dim=$(matrix-dim --print-args=false $alidir/final.mat | cut -f 1) + ;; + *) + echo "$0: Bad --feat-type '$feat_type';"; exit 1; +esac +if [ -z "$online_ivector_dir" ]; then + ivector_dim=0 +else + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; +fi + + +if [ $stage -le -5 ]; then + echo "$0: creating neural net configs"; + + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + python steps/nnet3/tdnn/make_configs.py \ + --splice-indexes "$splice_indexes" \ + --subset-dim "$subset_dim" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $dim_opts \ + --use-presoftmax-prior-scale $use_presoftmax_prior_scale \ + --num-targets $num_leaves \ + $dir/configs || exit 1; + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. 
+ $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; +fi + +# sourcing the "vars" below sets +# left_context=(something) +# right_context=(something) +# num_hidden_layers=(something) +. $dir/configs/vars || exit 1; + +context_opts="--left-context=$left_context --right-context=$right_context" + +! [ "$num_hidden_layers" -gt 0 ] && echo \ + "$0: Expected num_hidden_layers to be defined" && exit 1; + +[ -z "$transform_dir" ] && transform_dir=$alidir + + +if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then + extra_opts=() + [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") + [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) + [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) + extra_opts+=(--transform-dir $transform_dir) + extra_opts+=(--left-context $left_context) + extra_opts+=(--right-context $right_context) + echo "$0: calling get_egs.sh" + steps/nnet3/get_egs.sh $egs_opts "${extra_opts[@]}" \ + --samples-per-iter $samples_per_iter --stage $get_egs_stage \ + --cmd "$cmd" $egs_opts \ + --frames-per-eg $frames_per_eg \ + $data $alidir $dir/egs || exit 1; +fi + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; + exit 1; +fi +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; + exit 1; +fi + +# copy any of the following that exist, to $dir. +cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null + +# confirm that the egs_dir has the necessary context (especially important if +# the --egs-dir option was used on the command line). +egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 +egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ + echo "$0: egs in $egs_dir have too little context" && exit -1; + +frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } + +# num_archives_expanded considers each separate label-position from +# 0..frames_per_eg-1 to be a separate archive. +if [ "$chunk_training" == "true" ]; then + num_archives_expanded=$num_archives +else + num_archives_expanded=$[$num_archives*$frames_per_eg] +fi + +[ $num_jobs_initial -gt $num_jobs_final ] && \ + echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; + +[ $num_jobs_final -gt $num_archives_expanded ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1; + + +if [ $stage -le -3 ]; then + echo "$0: getting preconditioning matrix for input features." + num_lda_jobs=$num_archives + [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs + + # Write stats with the same format as stats for LDA. 
+ $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune=$rand_prune \ + $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1; + + all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done) + $cmd $dir/log/sum_transform_stats.log \ + sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1; + + rm $all_lda_accs || exit 1; + + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. + $cmd $dir/log/get_transform.log \ + nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1; + + ln -sf ../lda.mat $dir/configs/lda.mat +fi + + +if [ $stage -le -2 ]; then + echo "$0: preparing initial vector for FixedScaleComponent before softmax" + echo " ... using priors^$presoftmax_prior_scale_power and rescaling to average 1" + + # obtains raw pdf count + $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \ + ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ + post-to-tacc --per-pdf=true $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1; + $cmd $dir/log/sum_pdf_counts.log \ + vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1; + rm $dir/pdf_counts.* + + awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \ + '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i; total += $i; } + num_pdfs=NF-2; average_count = total/num_pdfs; + for (i=0; i $dir/presoftmax_prior_scale.vec + ln -sf ../presoftmax_prior_scale.vec $dir/configs/presoftmax_prior_scale.vec +fi + +if [ $stage -le -1 ]; then + # Add the first layer; this will add in the lda.mat and + # presoftmax_prior_scale.vec. + $cmd $dir/log/add_first_layer.log \ + nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1; + + # Convert to .mdl, train the transitions, set the priors. + $cmd $dir/log/init_mdl.log \ + nnet3-am-init $alidir/final.mdl $dir/0.raw - \| \ + nnet3-am-train-transitions - "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl || exit 1; +fi + + +# set num_iters so that as close as possible, we process the data $num_epochs +# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded, +# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. + +num_archives_to_process=$[$num_epochs*$num_archives_expanded] +num_archives_processed=0 +num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] + +! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \ + && echo "$0: Insufficient epochs" && exit 1 + +finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period] + +echo "$0: Will train for $num_epochs epochs = $num_iters iterations" + +if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + combine_queue_opt="--gpu 1" + prior_gpu_opt="--use-gpu=yes" + prior_queue_opt="--gpu 1" + parallel_train_opts= + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" + combine_queue_opt="" # the combine stage will be quite slow if not using + # GPU, as we didn't enable that program to use + # multiple threads. 
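Relating this back to the iteration count worked out just above the GPU setup: num_iters is chosen so that, with the job count ramping linearly from num_jobs_initial to num_jobs_final, roughly num_epochs passes are made over the expanded archives. A worked example with a hypothetical archive count:

num_epochs = 15                         # default above
num_jobs_initial, num_jobs_final = 1, 8
num_archives_expanded = 120             # hypothetical: num_archives * frames_per_eg

num_archives_to_process = num_epochs * num_archives_expanded
num_iters = (num_archives_to_process * 2) // (num_jobs_initial + num_jobs_final)
print(num_iters)  # 400 iterations at an average of 4.5 jobs each == 1800 archive passes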
+ prior_gpu_opt="--use-gpu=no" + prior_queue_opt="" +fi + + +approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final] +# First work out how many iterations we want to combine over in the final +# nnet3-combine-fast invocation. (We may end up subsampling from these if the +# number exceeds max_model_combine). The number we use is: +# min(max(max_models_combine, approx_iters_per_epoch_final), +# 1/2 * iters_after_last_layer_added) +num_iters_combine=$max_models_combine +if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then + num_iters_combine=$approx_iters_per_epoch_final +fi +half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2] +if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then + num_iters_combine=$half_iters_after_add_layers +fi +first_model_combine=$[$num_iters-$num_iters_combine+1] + +x=0 + +for realign_time in $realign_times; do + # Work out the iterations on which we will re-align, if the --realign-times + # option was used. This is slightly approximate. + ! perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);" && \ + echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1."; + # the next formula is based on the one for mix_up_iter above. + realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1; + realign_this_iter[$realign_iter]=$realign_time +done + +cur_egs_dir=$egs_dir + +while [ $x -lt $num_iters ]; do + [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; + + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") + + ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; + this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); + + echo "On iteration $x, learning rate is $this_learning_rate." + + if [ ! -z "${realign_this_iter[$x]}" ]; then + prev_egs_dir=$cur_egs_dir + cur_egs_dir=$dir/egs_${realign_this_iter[$x]} + fi + + if [ $x -ge 0 ] && [ $stage -le $x ]; then + if [ ! -z "${realign_this_iter[$x]}" ]; then + time=${realign_this_iter[$x]} + + echo "Getting average posterior for purposes of adjusting the priors." + # Note: this just uses CPUs, using a smallish subset of data. + # always use the first egs archive, which makes the script simpler; + # we're using different random subsets of it. + rm $dir/post.$x.*.vec 2>/dev/null + $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --srand=JOB --frame=random $context_opts ark:$prev_egs_dir/egs.1.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs --apply-exp=true "nnet3-am-copy --raw=true $dir/$x.mdl -|" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; + + sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. 
+ + $cmd $dir/log/vector_sum.$x.log \ + vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1; + rm $dir/post.$x.*.vec; + + echo "Re-adjusting priors based on computed posteriors" + $cmd $dir/log/adjust_priors.$x.log \ + nnet3-am-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1; + + sleep 2 + + steps/nnet3/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \ + --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \ + --iter $x $data $lang $dir $dir/ali_$time || exit 1 + + steps/nnet3/relabel_egs.sh --cmd "$cmd" --iter $x $dir/ali_$time \ + $prev_egs_dir $cur_egs_dir || exit 1 + + if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then + steps/nnet3/remove_egs.sh $prev_egs_dir + fi + fi + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + $cmd $dir/log/compute_prob_valid.$x.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.$x.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & + + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:-|" '&&' \ + nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" & + fi + + echo "Training neural net (pass $x)" + + if [ $x -gt 0 ] && \ + [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \ + [ $[$x%$add_layers_period] -eq 0 ]; then + do_average=false # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers=$[1+$x/$add_layers_period] + config=$dir/configs/layer$cur_num_hidden_layers.config + raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" + else + do_average=true + if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. + raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" + fi + if $do_average; then + this_minibatch_size=$minibatch_size + else + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + this_minibatch_size=$[$minibatch_size/2]; + fi + + rm $dir/.error 2>/dev/null + + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We can't easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + for n in $(seq $this_num_jobs); do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. 
+ frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame + # index; this increases more slowly than the archive index because the + # same archive with different frame indexes will give similar gradients, + # so we want to separate them in time. + + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-train $parallel_train_opts \ + --max-param-change=$max_param_change "$raw" \ + "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + ) + # the error message below is not that informative, but $cmd will + # have printed a more specific one. + [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1; + + nnets_list= + for n in `seq 1 $this_num_jobs`; do + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" + done + + if $do_average; then + # average the output of the different jobs. + $cmd $dir/log/average.$x.log \ + nnet3-average $nnets_list - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + else + # choose the best from the different jobs. + n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { + $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; + undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; + [ -z "$n" ] && echo "Error getting best model" && exit 1; + $cmd $dir/log/select.$x.log \ + nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + fi + + rm $nnets_list + [ ! -f $dir/$[$x+1].mdl ] && exit 1; + if [ -f $dir/$[$x-1].mdl ] && $cleanup && \ + [ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then + rm $dir/$[$x-1].mdl + fi + fi + x=$[$x+1] + num_archives_processed=$[$num_archives_processed+$this_num_jobs] +done + + +if [ $stage -le $num_iters ]; then + echo "Doing final combination to produce final.mdl" + + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + nnets_list=() + for n in $(seq 0 $[num_iters_combine-1]); do + iter=$[$first_model_combine+$n] + mdl=$dir/$iter.mdl + [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1; + nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|"; + done + + # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, + # as if there are many models it can give out-of-memory error; and we set + # num-threads to 8 to speed it up (this isn't ideal...) + + $cmd $combine_queue_opt $dir/log/combine.log \ + nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 "${nnets_list[@]}" "ark:nnet3-merge-egs --minibatch-size=1024 ark:$cur_egs_dir/combine.egs ark:-|" \ + "|nnet3-am-copy --set-raw-nnet=- $dir/$num_iters.mdl $dir/combined.mdl" || exit 1; + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. 
+ $cmd $dir/log/compute_prob_valid.final.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.final.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & +fi + +if [ $stage -le $[$num_iters+1] ]; then + echo "Getting average posterior for purposes of adjusting the priors." + # Note: this just uses CPUs, using a smallish subset of data. + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi + rm $dir/post.$x.*.vec 2>/dev/null + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ + "nnet3-am-copy --raw=true $dir/combined.mdl -|" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; + + sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. + + $cmd $dir/log/vector_sum.$x.log \ + vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1; + + rm $dir/post.$x.*.vec; + + echo "Re-adjusting priors based on computed posteriors" + $cmd $dir/log/adjust_priors.final.log \ + nnet3-am-adjust-priors $dir/combined.mdl $dir/post.$x.vec $dir/final.mdl || exit 1; +fi + + +if [ ! -f $dir/final.mdl ]; then + echo "$0: $dir/final.mdl does not exist." + # we don't want to clean up if the training didn't succeed. + exit 1; +fi + +sleep 2 + +echo Done + +if $cleanup; then + echo Cleaning up data + if $remove_egs && [[ $cur_egs_dir =~ $dir/egs* ]]; then + steps/nnet2/remove_egs.sh $cur_egs_dir + fi + + echo Removing most of the models + for x in `seq 0 $num_iters`; do + if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then + # delete all but every 100th model; don't delete the ones which combine to form the final model. + rm $dir/$x.mdl + fi + done +fi
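For readers who prefer not to decode the perl one-liner used earlier to pick the best of the parallel training jobs (on iterations where model averaging is skipped), here is an equivalent sketch in Python; the function name and return convention are mine, and the log-file pattern corresponds to the train.$iter.$n.log files written by the training loop above.

import re

def best_job(num_jobs, log_pattern):
    # pick the job whose log reports the highest (last-seen) log-prob-per-frame
    best_n, best_logprob = None, float("-inf")
    for n in range(1, num_jobs + 1):
        logprob = None
        with open(log_pattern % n) as f:
            for line in f:
                m = re.search(r"log-prob-per-frame=(\S+)", line)
                if m:
                    logprob = float(m.group(1))
        if logprob is not None and logprob > best_logprob:
            best_n, best_logprob = n, logprob
    return best_n

# e.g. best_job(this_num_jobs, "exp/nnet3/tdnn/log/train.42.%d.log")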