diff --git a/egs/fisher_english/s5/local/chain/compare_wer_general.py b/egs/fisher_english/s5/local/chain/compare_wer_general.py new file mode 100755 index 00000000000..e3a2dc5417a --- /dev/null +++ b/egs/fisher_english/s5/local/chain/compare_wer_general.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python + +import argparse +import collections +import os +import re +import sys + +sys.path.insert(0, 'steps') +import libs.common as common_lib + +from collections import defaultdict + +def get_args(): + parser = argparse.ArgumentParser( + description=""" +This script is used for comparing decoding results between systems. +e.g. local/chain/compare_wer_general.py exp/chain_cleaned/tdnn_{c,d}_sp +For use with discriminatively trained systems you specify the epochs after a colon: +for instance, +local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_c_sp exp/chain_cleaned/tdnn_c_sp_smbr:{1,2,3} +""") + + parser.add_argument("--separator", type=str, default=" ", + help="Separator for different fields") + parser.add_argument("--print-fine-details", action='store_true', + help="Add fine details of insertions, substitutions " + "and deletions.") + parser.add_argument("--include-looped", action='store_true', + help="Used to include looped results") + parser.add_argument("--field-size", type=int, + help="Field size for the models") + parser.add_argument("systems", nargs='+') + + args = parser.parse_args() + return args + + +def parse_system_string(system_string): + parts = system_string.split(":") + if len(parts) not in [1, 2, 3]: + raise RuntimeError("Unable to parse system string {0}" + "".format(system_string)) + + dir_name = parts[0] + + suffix = "" + if len(parts) > 1: + suffix = parts[1] + + model_name = os.path.basename(dir_name) + if len(parts) > 2: + model_name = parts[2] + + return (dir_name, suffix, model_name) + + +class SystemInfo(object): + def __init__(self, dir_name, suffix, model_name): + self.dir_name = dir_name + self.suffix = suffix + self.model_name = model_name + self.iter_ = "final" + + if self.suffix != "": + m = re.search("_iter(\d+)", suffix) + if bool(m): + self.iter_ = m.group(1) + else: + used_epochs = False + + self.probs = [] + self.wers = defaultdict(lambda: "NA") + self.ins = defaultdict(lambda: "NA") + self.dels = defaultdict(lambda: "NA") + self.sub = defaultdict(lambda: "NA") + + def add_wer(self, dev_set, affix=""): + decode_name = dev_set + self.suffix + + out = common_lib.get_command_stdout( + "grep WER {dir_name}/decode{affix}_{decode_name}/wer* | utils/best_wer.sh" + "".format(dir_name=self.dir_name, affix=affix, + decode_name=decode_name), + require_zero_status=False) + + if out != "" and len(out.split()) >= 2: + self.wers[(dev_set, affix)] = out.split()[1] + self.ins[(dev_set, affix)] = out.split()[6] + self.dels[(dev_set, affix)] = out.split()[8] + self.sub[(dev_set, affix)] = out.split()[10] + + def _get_prob(self, set_="train", xent=False): + + if not os.path.exists( + "{dir_name}/log/compute_prob_{set}.{iter}.log" + "".format(dir_name=self.dir_name, set=set_, iter=self.iter_)): + return "NA" + + out = common_lib.get_command_stdout( + "grep Overall {dir_name}/log/compute_prob_{set}.{iter}.log | " + "grep {opt} xent".format(dir_name=self.dir_name, set=set_, + iter=self.iter_, + opt="-w" if xent else "-v"), + require_zero_status=False) + + if out == "": + return "NA" + + lines = out.split("\n") + prob = None + + affix = "-xent" if xent else "" + for line in lines: + if (bool(re.search(r"'output-0{0}'".format(affix), line)) + or 
bool(re.search(r"'output{0}'".format(affix), line))): + prob = float(line.split()[7]) + break + + return "NA" if prob is None else "{0:.4f}".format(prob) + + def add_probs(self): + self.probs.append(self._get_prob(set_="train", xent=False)) + self.probs.append(self._get_prob(set_="valid", xent=False)) + self.probs.append(self._get_prob(set_="train", xent=True)) + self.probs.append(self._get_prob(set_="valid", xent=True)) + + +def run(args): + used_epochs = False + systems = [] + for sys_string in args.systems: + dir_name, suffix, model_name = parse_system_string(sys_string) + info = SystemInfo(dir_name, suffix, model_name) + + if suffix != "" and re.search("epoch", suffix): + used_epochs = True + else: + used_epochs = False + + for dev_set in ["dev", "test"]: + info.add_wer(dev_set) + + if args.include_looped: + info.add_wer(dev_set, affix="_looped") + + if not used_epochs: + info.add_probs() + + systems.append(info) + + print_system_infos(args, systems, used_epochs) + + +def print_system_infos(args, system_infos, used_epochs=False): + field_sizes = [args.field_size] * len(system_infos) + + if args.field_size is None: + for i, x in enumerate(system_infos): + field_sizes[i] = len(x.model_name) + + separator = args.separator + print ("# {0: <25}{sep}{1}".format( + "System", + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.model_name, field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + tups = set() + for sys_info in system_infos: + for tup in sys_info.wers: + tups.add(tup) + + for tup in sorted(list(tups)): + dev_set, affix = tup + print ("# {0: <25}{sep}{1}".format( + "WER on {0} {1}" + "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.wers[tup], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + if args.print_fine_details: + print ("# {0: <25}{sep}{1}".format( + "#Ins on {0} {1}" + "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.ins[tup], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + print ("# {0: <25}{sep}{1}".format( + "#Del on {0} {1}" + "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.dels[tup], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + print ("# {0: <25}{sep}{1}".format( + "#Sub on {0} {1}" + "".format(dev_set, "[ "+affix+" ]" if affix != "" else ""), + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.sub[tup], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + if not used_epochs: + print ("# {0: <25}{sep}{1}".format( + "Final train prob", + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[0], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + print ("# {0: <25}{sep}{1}".format( + "Final valid prob", + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[1], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + print ("# {0: <25}{sep}{1}".format( + "Final train prob (xent)", + "{sep}".format(sep=args.separator).join( + ["{0: <{1}}".format(x.probs[2], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + print ("# {0: <25}{sep}{1}".format( + "Final valid prob (xent)", + "{sep}".format(sep=args.separator).join( + ["{0: 
<{1}}".format(x.probs[3], field_sizes[i]) + for i, x in enumerate(system_infos)]), + sep=args.separator)) + + +if __name__ == "__main__": + args = get_args() + run(args) diff --git a/egs/fisher_english/s5/local/chain/confidence_calibration.sh b/egs/fisher_english/s5/local/chain/confidence_calibration.sh new file mode 100755 index 00000000000..34a487085aa --- /dev/null +++ b/egs/fisher_english/s5/local/chain/confidence_calibration.sh @@ -0,0 +1,88 @@ +#!/bin/bash +. cmd.sh +. path.sh + +chaindir=exp/chain_semi350k_conf/tdnn_xxsup1a_sp +arpa_gz=data/local/lm_ex250k/3gram-mincount/lm_unpruned.gz +graph_affix=_ex250k +decode_affix= +train_set=train_sup_5k_calib_train +dev_set=dev_sup_5k_calib_dev + +. utils/parse_options.sh + +set -euxo pipefail + +train_data=data/${train_set}_hires +dev_data=data/${dev_set}_hires + +decode_affix=${decode_affix}${graph_affix} +graphdir=$chaindir/graph${graph_affix} +train_caldir=$chaindir/decode_${train_set}${decode_affix}/confidence +dev_caldir=$chaindir/decode_${dev_set}${decode_affix}/confidence + +###### Data preparation, + +# Prepare filtering for excluding data from train-set (1 .. keep word, 0 .. exclude word), +# - only excludes from training-targets, the confidences are recalibrated for all the words, +word_filter=$(mktemp) +awk '{ keep_the_word = $1 !~ /^(\[.*\]|<.*>|%.*|!.*|-.*|.*-)$/; print $0, keep_the_word }' \ + $graphdir/words.txt >$word_filter + +# Calcualte the word-length, +word_length=$(mktemp) +awk '{if(r==0) { len_hash[$1] = NF-2; } + if(r==1) { if(len_hash[$1]) { len = len_hash[$1]; } else { len = -1 } + print $0, len; }}' \ + r=0 $graphdir/phones/align_lexicon.txt \ + r=1 $graphdir/words.txt \ + >$word_length + +# Extract unigrams, +unigrams=$(mktemp); steps/conf/parse_arpa_unigrams.py $graphdir/words.txt $arpa_gz $unigrams + +###### Paste the 'word-specific' features (first 4 columns have fixed position, more feature-columns can be added), +# Format: "word word_id filter length other_features" +word_feats=$(mktemp) +paste $word_filter <(awk '{ print $3 }' $word_length) <(awk '{ print $3 }' $unigrams) > $word_feats + + +###### Train the calibration, +steps/conf/train_calibration.sh --cmd "$decode_cmd" --lmwt 10 \ + $train_data $graphdir $word_feats \ + $chaindir/decode_${train_set}${decode_affix} $train_caldir + +###### Apply the calibration to eval set, +steps/conf/apply_calibration.sh --cmd "$decode_cmd" \ + $dev_data $graphdir $chaindir/decode_${dev_set}${decode_affix} \ + $train_caldir $dev_caldir +# The final confidences are here '$eval_caldir/ctm_calibrated', + +exit 0 + +###### Sclite scoring, +# We will produce NCE which shows the ``quality'' of the confidences. +# Please compare with the default scoring script for your database. 
+ +# Scoring tools, +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubdir=`dirname $hubscr` + +# Inputs, +ctm=$dev_caldir/ctm_calibrated +stm=$dev_data/stm +glm=$dev_data/glm + +# Normalizing CTM, just like in 'local/score_sclite.sh', +cat $ctm | grep -i -v -E '\[NOISE|LAUGHTER|VOCALIZED-NOISE\]' | \ + grep -i -v -E '<UNK>' | \ + grep -i -v -E ' (UH|UM|EH|MM|HM|AH|HUH|HA|ER|OOF|HEE|ACH|EEE|EW) ' | \ + awk '$5 !~ /^.*-$/' | \ + local/map_acronyms_ctm.py -M data/local/dict_nosp/acronyms.map -i - -o ${ctm}.filt + +# Mapping the time info to global, +utils/convert_ctm.pl $dev_data/segments $dev_data/reco2file_and_channel <${ctm}.filt >${ctm}.filt.conv + +# Scoring, +$hubscr -p $hubdir -V -l english -h hub5 -g $glm -r $stm ${ctm}.filt.conv + diff --git a/egs/fisher_english/s5/local/chain/run_semisupervised.sh b/egs/fisher_english/s5/local/chain/run_semisupervised.sh new file mode 100755 index 00000000000..77ae92e49b6 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/run_semisupervised.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +set -e -o pipefail + +stage=-2 +nj=30 +decode_nj=30 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k # affix relating train-set splitting proportion + +tdnn_affix=_sup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# combination options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the combined egs dir +comb_affix=_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +unsup_egs_weight=1.0 +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +left_tolerance=2 +right_tolerance=2 +train_combined_opts="--num-epochs 4.5" +graph_affix= # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= +# to tune: +# frames_per_eg for unsupervised + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} + +if ! 
cuda-compiled; then + cat <$n1?$n2:$n1)) + num_archives=$[num_archives*3/2] + mkdir -p $comb_egs_dir/log + cp {$sup_egs_dir,$comb_egs_dir}/train_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/valid_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/combine.cegs + cp {$sup_egs_dir,$comb_egs_dir}/cmvn_opts + cp -r $sup_egs_dir/info $comb_egs_dir + echo $num_archives > $comb_egs_dir/info/num_archives + cat {$sup_egs_dir,$unsup_egs_dir}/info/num_frames | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/num_frames + cat {$sup_egs_dir,$unsup_egs_dir}/info/egs_per_archive | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/egs_per_archive + out_egs_list= + egs_list= + for n in $(seq $num_archives); do + [ -f $sup_egs_dir/cegs.$n.ark ] && egs_list="$egs_list $sup_egs_dir/cegs.$n.ark" + [ -f $unsup_egs_dir/cegs.$n.ark ] && egs_list="$egs_list $unsup_egs_dir/cegs.$n.ark" + out_egs_list="$out_egs_list ark:$comb_egs_dir/cegs.$n.ark" + done + srand=0 + $decode_cmd $comb_egs_dir/log/combine.log \ + nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list +fi + +if [ $stage -le 3 ]; then + echo "$0: training on the supervised+unsupervised subset" + # the train-set and gmm do not matter as we are providing the egs + local/chain/run_tdnn.sh --stage 12 --remove-egs false --train-set $supervised_set \ + --nnet3-affix $nnet3_affix \ + --tdnn-affix ${tdnn_affix}${decode_affix}${egs_affix}${comb_affix} \ + --common-egs-dir $comb_egs_dir $train_combined_opts +fi diff --git a/egs/fisher_english/s5/local/chain/run_tdnn.sh b/egs/fisher_english/s5/local/chain/run_tdnn.sh index aefe920ce11..47177f422bf 100755 --- a/egs/fisher_english/s5/local/chain/run_tdnn.sh +++ b/egs/fisher_english/s5/local/chain/run_tdnn.sh @@ -29,18 +29,12 @@ tree_affix= nnet3_affix= xent_regularize=0.1 hidden_dim=725 -num_leaves=11000 # training options num_epochs=4 remove_egs=false common_egs_dir= minibatch_size=128 -num_jobs_initial=3 -num_jobs_final=16 -initial_effective_lrate=0.001 -final_effective_lrate=0.0001 -frames_per_iter=1500000 gmm=tri5a build_tree_ali_dir=exp/tri4a_ali # used to make a new tree for chain topology, should match train data @@ -106,7 +100,7 @@ if [ $stage -le 11 ]; then # Build a tree using our new topology. 
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" $num_leaves $build_tree_train_data_dir $lang $build_tree_ali_dir $treedir || exit 1; + --cmd "$train_cmd" 11000 $build_tree_train_data_dir $lang $build_tree_ali_dir $treedir || exit 1; fi if [ $stage -le 12 ]; then @@ -175,12 +169,12 @@ if [ $stage -le 13 ]; then --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width 150 \ --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter $frames_per_iter \ + --trainer.frames-per-iter 1500000 \ --trainer.num-epochs $num_epochs \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.max-param-change 2.0 \ --cleanup.remove-egs $remove_egs \ --feat-dir $train_data_dir \ diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_b.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_b.sh new file mode 100644 index 00000000000..6254dd5d184 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_b.sh @@ -0,0 +1,198 @@ +#!/bin/bash +set -e + +# Based on run_tdnn_7b.sh in the fisher swbd recipe + +# configs for 'chain' +stage=0 +tdnn_affix=7b +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train +tree_affix= +nnet3_affix= +gmm=tri5a +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
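+  # (The positional argument 11000 below is the requested number of tree leaves, i.e. the number of context-dependent targets the chain model will predict.)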
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 11000 $build_tree_train_data_dir $lang $build_tree_ali_dir $treedir || exit 1; +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_a.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_a.sh new file mode 100755 index 00000000000..c5e0401c3e5 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_a.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +set -e -o pipefail + +stage=-2 +nj=30 +decode_nj=30 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k # affix relating train-set splitting proportion + +tdnn_affix=_sup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# combination options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +comb_affix=_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +unsup_egs_weight=1.0 +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +left_tolerance=2 +right_tolerance=2 +train_combined_opts="--num-epochs 4.5" +graph_affix= # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= +# to tune: +# frames_per_eg for unsupervised + +# End configuration section. 
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} + +if ! cuda-compiled; then + cat <$n1?$n2:$n1)) + num_archives=$[num_archives*3/2] + mkdir -p $comb_egs_dir/log + cp {$sup_egs_dir,$comb_egs_dir}/train_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/valid_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/combine.cegs + cp {$sup_egs_dir,$comb_egs_dir}/cmvn_opts + cp -r $sup_egs_dir/info $comb_egs_dir + echo $num_archives > $comb_egs_dir/info/num_archives + cat {$sup_egs_dir,$unsup_egs_dir}/info/num_frames | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/num_frames + cat {$sup_egs_dir,$unsup_egs_dir}/info/egs_per_archive | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/egs_per_archive + out_egs_list= + egs_list= + for n in $(seq $num_archives); do + [ -f $sup_egs_dir/cegs.$n.ark ] && egs_list="$egs_list $sup_egs_dir/cegs.$n.ark" + [ -f $unsup_egs_dir/cegs.$n.ark ] && egs_list="$egs_list $unsup_egs_dir/cegs.$n.ark" + out_egs_list="$out_egs_list ark:$comb_egs_dir/cegs.$n.ark" + done + srand=0 + $decode_cmd $comb_egs_dir/log/combine.log \ + nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list +fi + +if [ $stage -le 3 ]; then + echo "$0: training on the supervised+unsupervised subset" + # the train-set and gmm do not matter as we are providing the egs + local/chain/run_tdnn.sh --stage 12 --remove-egs false --train-set $supervised_set \ + --nnet3-affix $nnet3_affix \ + --tdnn-affix ${tdnn_affix}${decode_affix}${egs_affix}${comb_affix} \ + --common-egs-dir $comb_egs_dir $train_combined_opts +fi diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_b.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_b.sh new file mode 100755 index 00000000000..0c12140c8c7 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_b.sh @@ -0,0 +1,253 @@ +#!/bin/bash + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=30 +decode_nj=30 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix=_ex250k +egs_affix=_prun2_lmwt0_tol2 # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +left_tolerance=2 +right_tolerance=2 +graph_affix= # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +common_egs_dir= + +# Semi-supervised options +comb_affix=_comb1b2 # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 + +decode_iter= +# to tune: +# frames_per_eg for unsupervised + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
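+  # (For example, with the default xent_regularize=0.1 above, this gives a learning-rate factor of 0.5 / 0.1 = 5.)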
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_sp_hires \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z "$decode_iter" ]; then + iter_opts=" --iter $decode_iter " + else + decode_iter=final + fi + + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output;" $dir/$decode_iter.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/$decode_iter.mdl $dir/${decode_iter}-output.mdl + + iter_opts=" --iter ${decode_iter}-output " + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_a.sh new file mode 100644 index 00000000000..4a0b5f1dd26 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_a.sh @@ -0,0 +1,331 @@ +#!/bin/bash + +# This script is the baseline with unsupervised egs in multilingual recipe. 
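+# (Here the supervised and unsupervised egs are combined with steps/nnet3/multilingual/combine_egs.sh and the network has two chain outputs; output-0 is presumably trained on the supervised egs and output-1 on the unsupervised egs.)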
+# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: None +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 0 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $conf_dir/weights.scp +fi + +left_context=`cat $chaindir/egs/info/left_context` +right_context=`cat $chaindir/egs/info/right_context` +left_context_initial=`cat $chaindir/egs/info/left_context_initial` +right_context_final=`cat $chaindir/egs/info/right_context_final` + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` +frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +cmvn_opts=`cat $chaindir/cmvn_opts` + +unsup_egs_dir=$chaindir/unsup_egs${decode_affix}${egs_affix} + +if [ $stage -le 9 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
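+  # Note: unlike the supervised egs, these examples are built directly from the decode lattices: --alignment-subsampling-factor 1 is used because the lattices are already at the chain output frame rate, --lattice-lm-scale and --lattice-prune-beam control how the lattice scores are kept/pruned, and --deriv-weights-scp supplies per-frame derivative weights from $conf_dir/weights.scp.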
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $conf_dir/weights.scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + data/${unsupervised_set}_hires $chaindir \ + ${chaindir}/decode_${unsupervised_set}${decode_affix} $unsup_egs_dir +fi + +sup_egs_dir=$chaindir/egs_scp +comb_egs_dir=$chaindir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 10 ]; then + + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --samples-per-iter 10000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +treedir=exp/chain${nnet3_affix}/tree_${tree_affix} +lat_dir=exp/chain${nnet3_affix}/tri5a_${supervised_set}_sp_lats # not required since egs is given. +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_sp_hires \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_c.sh new file mode 100644 index 00000000000..0564bf693ab --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_c.sh @@ -0,0 +1,380 @@ +#!/bin/bash + +# This script is similar to _a but uses denominator FST created using +# LM estimated on supervised + unsupervised set phone sequences +# and deriv weights from calibrated confidences. +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: None +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=_comb1c # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $conf_dir/weights.scp +fi + +left_context=`cat $chaindir/egs/info/left_context` +right_context=`cat $chaindir/egs/info/right_context` +left_context_initial=`cat $chaindir/egs/info/left_context_initial` +right_context_final=`cat $chaindir/egs/info/right_context_final` + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +treedir=exp/chain${nnet3_affix}/tree_${tree_affix} +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix} + +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +if [ $stage -le 9 ]; then + false && $decode_cmd JOB=1:$(cat $unsup_lat_dir/num_jobs) \ + ${chaindir}/best_path_${unsupervised_set}${decode_affix}/log/get_best_path.JOB.log \ + lattice-best-path --acoustic-scale=1.0 \ + "ark:gunzip -c $unsup_lat_dir/lat.JOB.gz |" ark:/dev/null \ + "ark:| gzip -c > ${chaindir}/best_path_${unsupervised_set}${decode_affix}/ali.JOB.gz" + + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +supervised_set=${supervised_set}_sp +sup_lat_dir=exp/chain${nnet3_affix}/tri5a_${supervised_set}_lats +sup_egs_dir=$dir/egs_${supervised_set} + +if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $(cat $chaindir/egs/info/frames_per_eg) \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir +fi + +unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
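+  # Note: --generate-egs-scp true (here and for the supervised egs above) writes scp listings of the examples, which the steps/nnet3/multilingual/combine_egs.sh call in the next stage expects in order to mix the two egs dirs with weights $supervision_weights.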
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $conf_dir/weights.scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $chaindir \ + $unsup_lat_dir $unsup_egs_dir +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 12 ]; then + + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --samples-per-iter 10000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $stage -le 13 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -2 ]; then + train_stage=-2 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_d.sh new file mode 100644 index 00000000000..572a3f8466e --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_d.sh @@ -0,0 +1,298 @@ +#!/bin/bash + +# This script is similar to _a but uses deriv weights from lattice-posteriors. +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posteriors (Bug when originally run) +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 0 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1d # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $stage -le 13 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_e.sh new file mode 100644 index 00000000000..24734d216e2 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_e.sh @@ -0,0 +1,396 @@ +#!/bin/bash + +# This script is similar to _c but re-creates supervised egs using new +# normalization FST. 
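+# (i.e. the supervised egs are dumped in stage 10 with this experiment's
+# directory as the chain dir, so get_egs.sh picks up the normalization FST
+# built in stage 9 from the weighted supervised+unsupervised phone-LM
+# combination, rather than the normalization FST of the baseline supervised
+# chain directory.)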
+# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: None +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +sup_egs_dir= +comb_affix=comb1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $conf_dir/weights.scp +fi + +left_context=`cat $chaindir/egs/info/left_context` +right_context=`cat $chaindir/egs/info/right_context` +left_context_initial=`cat $chaindir/egs/info/left_context_initial` +right_context_final=`cat $chaindir/egs/info/right_context_final` + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +treedir=exp/chain${nnet3_affix}/tree_${tree_affix} +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +if [ $stage -le 9 ]; then + false && $decode_cmd JOB=1:$(cat $unsup_lat_dir/num_jobs) \ + ${chaindir}/best_path_${unsupervised_set}${decode_affix}/log/get_best_path.JOB.log \ + lattice-best-path --acoustic-scale=1.0 \ + "ark:gunzip -c $unsup_lat_dir/lat.JOB.gz |" ark:/dev/null \ + "ark:| gzip -c > ${chaindir}/best_path_${unsupervised_set}${decode_affix}/ali.JOB.gz" + + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +supervised_set=${supervised_set}_sp +sup_lat_dir=exp/chain${nnet3_affix}/tri5a_${supervised_set}_lats + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + + left_context=`cat $chaindir/egs/info/left_context` + right_context=`cat $chaindir/egs/info/right_context` + left_context_initial=`cat $chaindir/egs/info/left_context_initial` + right_context_final=`cat $chaindir/egs/info/right_context_final` + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + left_context=`cat $sup_egs_dir/info/left_context` + right_context=`cat $sup_egs_dir/info/right_context` + left_context_initial=`cat $sup_egs_dir/info/left_context_initial` + right_context_final=`cat $sup_egs_dir/info/right_context_final` + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg +unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $conf_dir/weights.scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $chaindir \ + $unsup_lat_dir $unsup_egs_dir +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 12 ]; then + + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --samples-per-iter 10000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $stage -le 13 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_f.sh new file mode 100644 index 00000000000..faef0c70546 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_f.sh @@ -0,0 +1,347 @@ +#!/bin/bash + +# This script is similar to _e but uses deriv weights from lattice-posteriors +# instead of from calibrated confidences. +# But there is a minor bug in creating the lattice posteriors when this +# script was run. An acwt of 1.0 was used for lattice-best-path when it +# should have been 0.1. +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posteriors (Bug when originally run) +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +sup_egs_dir= +comb_affix=comb1f # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 + +tree_affix= +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
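+  # Note: output-0, output-1 and output each get their own affine component
+  # from this xconfig, but the perl edit applied to final.config further down
+  # re-points output-1 (and output-1-xent) at the output-0 affine components,
+  # so the two outputs end up sharing parameters.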
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_g.sh new file mode 100644 index 00000000000..9dbca030174 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_g.sh @@ -0,0 +1,383 @@ +#!/bin/bash + +# This script is same as _e but uses a weight of 1.0 for unsupervised egs. +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: None +# Unsupervised weight: 1.0 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1g # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $conf_dir/weights.scp +fi + +frame_subsampling_factor=1 +if [ -f $chaindir/frame_subsampling_factor ]; then + frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +fi +cmvn_opts=`cat $chaindir/cmvn_opts` || exit 1 + +treedir=exp/chain${nnet3_affix}/tree_${tree_affix} +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +if [ $stage -le 9 ]; then + $decode_cmd JOB=1:$(cat $unsup_lat_dir/num_jobs) \ + ${chaindir}/best_path_${unsupervised_set}${decode_affix}/log/get_best_path.JOB.log \ + lattice-best-path --acoustic-scale=1.0 \ + "ark:gunzip -c $unsup_lat_dir/lat.JOB.gz |" ark:/dev/null \ + "ark:| gzip -c > ${chaindir}/best_path_${unsupervised_set}${decode_affix}/ali.JOB.gz" + + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +supervised_set=${supervised_set}_sp +sup_lat_dir=exp/chain${nnet3_affix}/tri5a_${supervised_set}_lats + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + left_context=`cat $sup_egs_dir/info/left_context` + right_context=`cat $sup_egs_dir/info/right_context` + left_context_initial=`cat $sup_egs_dir/info/left_context_initial` + right_context_final=`cat $sup_egs_dir/info/right_context_final` + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
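+    # The unsupervised egs are dumped with the same context and
+    # frame-subsampling settings as the supervised egs (held in the variables
+    # set above), so that the two egs directories can later be combined by
+    # combine_egs.sh.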
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $conf_dir/weights.scp \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $chaindir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 12 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --samples-per-iter 10000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $stage -le 13 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_h.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_h.sh new file mode 100644 index 00000000000..866f310c0ed --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_h.sh @@ -0,0 +1,348 @@ +#!/bin/bash + +# This script is same as _g, but uses deriv weights from lattice posteriors +# instead of calibrated confidences. But there was a bug when running this +# script. (An acwt of 1.0 was used for lattice-best-path instead of 0.1) +# lattice_lm_scale=0.0 +# lattice_prune_beam=2.0 +# tolerance=2 +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posteriors (Bug when originally run) +# Unsupervised weight: 0.5 +# Unsupervised weight for phone LM: 2/3 + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1h # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
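+  # Note: the three chain output layers above (output, output-0, output-1) all
+  # share prefinal-chain; output-0 and output-1 are the outputs used for the
+  # combined supervised+unsupervised egs, and at decode time the script removes
+  # the plain 'output' node and renames output-0 to 'output' (see the
+  # nnet3-copy --edits command in the decode stage below).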
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_i.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_i.sh new file mode 100644 index 00000000000..69e29d600c9 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_i.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is similar to _h, but uses unsup_frames_per_eg of 300. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=300 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1i # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_j.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_j.sh new file mode 100644 index 00000000000..6d98f9cf6da --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_j.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _k, but uses a weight of 0.5 for unsupervised egs. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=300 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1j # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_k.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_k.sh new file mode 100644 index 00000000000..96d101ac2f2 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_k.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _f, but uses an lm-scale of 0.1. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.1 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1k # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
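+  # For example, with xent_regularize=0.1 as set at the top of this script,
+  # learning_rate_factor = 0.5 / 0.1 = 5.0, so the xent output layers below
+  # are updated roughly five times faster than the rest of the network.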
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_l.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_l.sh new file mode 100644 index 00000000000..371bfcfc1b6 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_l.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _f, but uses an lm-scale of 0.5. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1l # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
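+  # Note: output-0 and output-1 presumably serve the supervised and unsupervised
+  # egs streams that are combined for training; at decode time the nnet3-copy
+  # --edits command further below removes the plain 'output' node and renames
+  # 'output-0' to 'output'.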
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_m.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_m.sh new file mode 100644 index 00000000000..b608e77e309 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_m.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1m # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
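+  # Note: after xconfig_to_configs.py generates final.config, the perl
+  # substitution below rewrites component=output-1.affine (and its -xent
+  # counterpart) to component=output-0.affine, so both output heads end up
+  # sharing the same affine parameters.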
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_n.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_n.sh new file mode 100644 index 00000000000..b463ed56485 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_n.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# This script is same as _c, but redone to be consistent with _m. +# So it does not have any deriv weights. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1n # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=3,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
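+  # Note: the chain output layers above set include-log-softmax=false because
+  # the 'chain' (LF-MMI) objective is computed on unnormalized outputs; the
+  # xent output layers below keep the default log-softmax.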
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_o.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_o.sh new file mode 100644 index 00000000000..b4e9e1e5faf --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_o.sh @@ -0,0 +1,341 @@ +#!/bin/bash + +# This script is same as _a, but re-done to be consistent with _m. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb350k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup +semi_affix=350k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1o # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.5 +lm_weights=1 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
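+  # Reminder on the descriptor notation used above: Append(-3,0,3) splices a
+  # layer's input at time offsets t-3, t and t+3, and ReplaceIndex(ivector, t, 0)
+  # makes every frame use the iVector from t=0.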
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_p.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_p.sh new file mode 100644 index 00000000000..7137523c843 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_p.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb270k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train +semi_affix=270k_conf # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a_20k # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1p # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
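+  # Note: max-change=1.5 limits how far each output layer's parameters can move
+  # in a single minibatch, and target-rms=0.5 on the prefinal layers scales
+  # their activations down; these are the values typically used in chain recipes.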
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_q.sh b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_q.sh new file mode 100644 index 00000000000..cf12901f617 --- /dev/null +++ b/egs/fisher_english/s5/local/chain/tuning/run_tdnn_semisupervised_conf_q.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=train_comb270k # for reference + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup_20k +semi_affix=270k_conf_pca # affix relating train-set splitting proportion + +tdnn_affix=_xxsup1a_20k # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1q # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_semi${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${supervised_set}_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 16 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/fisher_create_test_lang.sh b/egs/fisher_english/s5/local/fisher_create_test_lang.sh index 1d7c4013b83..533a0949962 100755 --- a/egs/fisher_english/s5/local/fisher_create_test_lang.sh +++ b/egs/fisher_english/s5/local/fisher_create_test_lang.sh @@ -44,5 +44,7 @@ fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ fstisstochastic || echo "[log:] LG is not stochastic" +utils/build_const_arpa_lm.sh data/local/lm/4gram-mincount/lm_unpruned.gz \ + data/lang_test data/lang_test_fg echo "$0 succeeded" diff --git a/egs/fisher_english/s5/local/fisher_train_lms.sh b/egs/fisher_english/s5/local/fisher_train_lms.sh index 881d3ce9466..585680550f8 100755 --- a/egs/fisher_english/s5/local/fisher_train_lms.sh +++ b/egs/fisher_english/s5/local/fisher_train_lms.sh @@ -70,6 +70,8 @@ cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1] train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; +train_lm.sh --arpa --lmtype 4gram-mincount $dir || exit 1; + # Perplexity over 88307.000000 words (excluding 691.000000 OOVs) is 71.241332 # note: output is diff --git a/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh b/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh index 6505381b03f..b1285de008f 100755 --- a/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh +++ b/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh @@ -6,8 +6,9 @@ stage=1 generate_alignments=true # false if doing chain training speed_perturb=true train_set=train - lda_train_set=train_100k +extractor= # ivector-extractor. + # If provided, will be used instead of training a new one. nnet3_affix= gmm=tri2_ali # should also contain alignments for $lda_train_set @@ -94,37 +95,42 @@ for line in sys.stdin.readlines(): steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems done - - # Take the first 30k utterances (about 1/8th of the data) this will be used - # for the diagubm training - utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires - utils/data/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr fi -# ivector extractor training -if [ $stage -le 4 ]; then - # We need to build a small system just because we need the LDA+MLLT transform - # to train the diag-UBM on top of. 
We use --num-iters 13 because after we get - # the transform (12th iter is the last), any further training is pointless. - # this decision is based on fisher_english - steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ - --splice-opts "--left-context=3 --right-context=3" \ - 5500 90000 data/${lda_train_set}_hires \ - data/lang $gmm_dir exp/nnet3${nnet3_affix}/tri3a -fi +if [ -z "$extractor" ]; then + if [ $stage -le 3 ]; then + # Take the first 30k utterances (about 1/8th of the data) this will be used + # for the diagubm training + utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires + utils/data/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr + fi -if [ $stage -le 5 ]; then - # To train a diagonal UBM we don't need very much data, so use the smallest subset. - steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ - data/${train_set}_30k_nodup_hires 512 exp/nnet3${nnet3_affix}/tri3a exp/nnet3${nnet3_affix}/diag_ubm -fi + # ivector extractor training + if [ $stage -le 4 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + # this decision is based on fisher_english + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5500 90000 data/${lda_train_set}_hires \ + data/lang $gmm_dir exp/nnet3${nnet3_affix}/tri3a + fi + + if [ $stage -le 5 ]; then + # To train a diagonal UBM we don't need very much data, so use the smallest subset. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${train_set}_30k_nodup_hires 512 exp/nnet3${nnet3_affix}/tri3a exp/nnet3${nnet3_affix}/diag_ubm + fi -if [ $stage -le 6 ]; then - # iVector extractors can be sensitive to the amount of data, but this one has a - # fairly small dim (defaults to 100) so we don't use all of it, we use just the - # 100k subset (just under half the data). - steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ - data/${lda_train_set}_hires exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1; + if [ $stage -le 6 ]; then + # iVector extractors can be sensitive to the amount of data, but this one has a + # fairly small dim (defaults to 100) so we don't use all of it, we use just the + # 100k subset (just under half the data). 
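+    # Note: this extractor is what the new --extractor option lets other runs
+    # reuse; when --extractor is supplied, the subset creation, LDA+MLLT and
+    # diag-UBM stages above, as well as this extractor training, are all
+    # skipped.  A hypothetical invocation would be:
+    #   local/nnet3/run_ivector_common.sh --extractor exp/nnet3/extractor ...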
+ steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${lda_train_set}_hires exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1; + fi + extractor=exp/nnet3${nnet3_affix}/extractor fi if [ $stage -le 7 ]; then @@ -136,11 +142,11 @@ if [ $stage -le 7 ]; then steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/${train_set}_max2_hires exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ivectors_${train_set}_hires || exit 1; + data/${train_set}_max2_hires $extractor `basename $extractor`/ivectors_${train_set}_hires || exit 1; for dataset in test dev; do steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/${dataset}_hires exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ivectors_${dataset}_hires || exit 1; + data/${dataset}_hires $extractor `basename $extractor`/ivectors_${dataset}_hires || exit 1; done fi diff --git a/egs/fisher_english/s5/local/nnet3/run_ivector_common_pca.sh b/egs/fisher_english/s5/local/nnet3/run_ivector_common_pca.sh new file mode 100755 index 00000000000..e159781e9a1 --- /dev/null +++ b/egs/fisher_english/s5/local/nnet3/run_ivector_common_pca.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +. ./cmd.sh +set -e +stage=1 +speed_perturb=true +train_set=train +ivector_train_set= # data set for training i-vector extractor. + # If not provided, train_set will be used. + +nnet3_affix= +exp=exp + +. ./path.sh +. ./utils/parse_options.sh + +# perturbed data preparation +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have + # to perturb the normal data to get the alignments. + # _sp stands for speed-perturbed + + for datadir in ${train_set} ${ivector_train_set}; do + utils/data/perturb_data_dir_speed_3way.sh data/${datadir} data/${datadir}_sp + utils/fix_data_dir.sh data/${datadir}_sp + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${datadir}_sp exp/make_mfcc/${datadir}_sp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_sp exp/make_mfcc/${datadir}_sp $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_sp + done + fi + train_set=${train_set}_sp + if ! [ -z "$ivector_train_set" ]; then + ivector_train_set=${ivector_train_set}_sp + fi +fi + +if [ $stage -le 3 ]; then + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5b/$mfccdir/storage $mfccdir/storage + fi + + for dataset in $ivector_train_set $train_set; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + utils/data/perturb_data_dir_volume.sh data/${dataset}_hires + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). 
+ utils/fix_data_dir.sh data/${dataset}_hires; + done + + for dataset in test dev; do + # Create MFCCs for the eval set + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems + done +fi + +if [ -z "$ivector_train_set" ]; then + ivector_train_set=$train_set +fi + +# ivector extractor training +if [ $stage -le 4 ]; then + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + --max-utts 10000 --subsample 2 \ + data/${ivector_train_set}_hires \ + $exp/nnet3${nnet3_affix}/pca_transform +fi + +if [ $stage -le 5 ]; then + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${ivector_train_set}_hires 512 \ + $exp/nnet3${nnet3_affix}/pca_transform $exp/nnet3${nnet3_affix}/diag_ubm +fi + +if [ $stage -le 6 ]; then + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${ivector_train_set}_hires $exp/nnet3${nnet3_affix}/diag_ubm $exp/nnet3${nnet3_affix}/extractor || exit 1; +fi + +if [ $stage -le 7 ]; then + # We extract iVectors on all the ${train_set} data, which will be what we + # train the system on. + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${ivector_train_set}_hires data/${ivector_train_set}_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${ivector_train_set}_max2_hires $exp/nnet3${nnet3_affix}/extractor $exp/nnet3${nnet3_affix}/ivectors_${ivector_train_set}_hires || exit 1; +fi + +if [ $stage -le 8 ]; then + for dataset in test dev; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${dataset}_hires $exp/nnet3${nnet3_affix}/extractor $exp/nnet3${nnet3_affix}/ivectors_${dataset}_hires || exit 1; + done +fi + +exit 0; + diff --git a/egs/fisher_english/s5/local/nnet3/run_tdnn.sh b/egs/fisher_english/s5/local/nnet3/run_tdnn.sh new file mode 100644 index 00000000000..f055b853b61 --- /dev/null +++ b/egs/fisher_english/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +# This script is not tested. + +# this is the standard "tdnn" system, built in nnet3; it's what we used to +# call multi-splice. + +. ./cmd.sh + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +stage=0 +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
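+  # ('chain' models are evaluated at one third of the usual frame rate, hence
+  # the --frame-subsampling-factor 3 passed when building the tree.)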
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 data/${train_set} $lang $gmm_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn5 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
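+
+  # A rough guide to the main options below: --chain.xent-regularize is the
+  # weight of the auxiliary cross-entropy output; --chain.leaky-hmm-coefficient
+  # adds a small 'leaky' transition probability that keeps the chain objective
+  # well behaved; --trainer.frames-per-iter is approximately how many frames
+  # are processed per training iteration; and the number of parallel jobs
+  # grows from 3 to 16 while the effective learning rate decays from 0.001 to
+  # 0.0001.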
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh new file mode 100644 index 00000000000..60f64dee299 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_a.sh @@ -0,0 +1,359 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. 
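+#
+# In outline, this semi-supervised recipe: (i) starts from an existing
+# supervised chain system trained on the train_sup11k subset, (ii) uses its
+# best-path decode of the train_unsup250k subset to build a tree and a
+# denominator FST over both data sources, (iii) generates egs separately from
+# the supervised lattices and the unsupervised decode lattices and combines
+# them with the weights in supervision_weights, and (iv) trains a new chain
+# model on the combined egs.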
+ +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
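+  # Note: the duplicated outputs 'output-0' / 'output-1' (and the matching
+  # '-xent' outputs below) correspond to the two egs sources -- supervised and
+  # unsupervised -- treated as separate "languages" by the multilingual
+  # egs-combining script; the perl edit of final.config further down ties the
+  # output-1 affine components to the output-0 ones so they share parameters.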
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
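+
+    # For the unsupervised data the supervision comes from the decode lattices
+    # rather than from alignments: --lattice-lm-scale keeps a scaled version of
+    # the lattice LM scores in the supervision, --lattice-prune-beam prunes the
+    # lattices first, the left/right tolerances control how far the labels may
+    # move in time, and --deriv-weights-scp supplies per-frame derivative
+    # weights obtained from the best-path decode.  --alignment-subsampling-factor
+    # is 1 because these lattices are already at the model's subsampled frame
+    # rate.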
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_b.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_b.sh new file mode 100644 index 00000000000..f106549167f --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_b.sh @@ -0,0 +1,358 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1b # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
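+
+    # Note: unlike the _a configuration, this variant does not pass
+    # --deriv-weights-scp to get_egs.sh, so the unsupervised egs are not
+    # weighted by per-frame derivative weights from the best path.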
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_c.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_c.sh new file mode 100644 index 00000000000..60f64dee299 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_c.sh @@ -0,0 +1,359 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix= + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_d.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_d.sh new file mode 100644 index 00000000000..780c783c87f --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_d.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1d # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,0.3 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh new file mode 100644 index 00000000000..9f2a2a8993b --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_e.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +# This script is same as _f, but fixes the bug about acwt for best path. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1e # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
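+  # (A note on the chain output names above: output-0 and output-1 will be fed
+  # the supervised and unsupervised egs respectively, since the combined egs get
+  # per-"language" output names from combine_egs.sh below; the edit to
+  # final.config further down makes output-1 reuse output-0's affine parameters,
+  # and at decode time output-0 is renamed back to 'output'.)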
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
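+    # Get egs for the unsupervised data.  The lattices come from decoding with
+    # the chain system itself (already at the subsampled frame rate), so
+    # --alignment-subsampling-factor is 1 here rather than 3.
+    # --lattice-lm-scale and --lattice-prune-beam control how the lattice scores
+    # are carried into the numerator supervision, and --deriv-weights-scp gives
+    # per-frame weights computed from the best path of those lattices.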
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_f.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_f.sh new file mode 100644 index 00000000000..346c5e6eede --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_f.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +# This script is same as _e, but is run for 3 epochs instead of 4. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg= # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1f # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
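+  # Note: output-0-xent below does not specify input=..., so it takes the
+  # immediately preceding layer (prefinal-xent) as its input, just as
+  # output-1-xent does explicitly.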
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
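+    # (Optional sanity check, not run by this script: once the egs below have
+    # been dumped, a few of them can be printed in text form with, e.g.,
+    #   nnet3-chain-copy-egs ark:$unsup_egs_dir/cegs.1.ark ark,t:- | head
+    # assuming the archives are named cegs.*.ark as for the supervised egs.)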
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_g.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_g.sh new file mode 100644 index 00000000000..ccca9c6d334 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_11k_semisupervised_conf_g.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +# This script is same as _e, but is run for 3 epochs instead of 4. + +set -u -e -o pipefail + +stage=-2 +train_stage=-100 +nj=40 +decode_nj=40 +base_train_set=semisup11k_250k # for reference +exp=exp/semisup_11k + +unsupervised_set=train_unsup250k # set this to your choice of unsupervised data +supervised_set=train_sup11k +semi_affix=semi11k_250k # affix relating train-set splitting proportion + +tdnn_affix=7b # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# Unsupervised options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +unsup_frames_per_eg=300 # if empty will be equal to the supervised model's config -- you will need to change minibatch_size for comb training accordingly +lattice_lm_scale=0.0 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam=2.0 # If supplied will prune the lattices prior to getting egs for unsupervised data +tolerance=2 +graph_affix=_ex250k # can be used to decode the unsup data with another lm/graph +phone_insertion_penalty= + +# Semi-supervised options +comb_affix=comb1g # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +supervision_weights=1.0,1.0 +lm_weights=5,2 +sup_egs_dir= +unsup_egs_dir= +tree_affix=fg + +extra_left_context=0 +extra_right_context=0 + +xent_regularize=0.1 +hidden_dim=725 +minibatch_size=128 +# to tune: +# frames_per_eg for unsupervised + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=_${semi_affix} # affix for nnet3 and chain dirs +decode_affix=${decode_affix}${graph_affix} +egs_affix=${egs_affix}_prun${lattice_prune_beam}_lmwt${lattice_lm_scale}_tol${tolerance} +tree_affix=${tree_affix}_${semi_affix} + +RANDOM=0 + +if ! 
cuda-compiled; then + cat < $chaindir/best_path_${unsupervised_set}${decode_affix}/frame_subsampling_factor + + steps/nnet3/chain/build_tree_multiple_sources.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --use-fmllr false \ + --cmd "$train_cmd" 10000 data/lang_chain \ + data/${supervised_set} $sup_ali_dir \ + data/${unsupervised_set} \ + $chaindir/best_path_${unsupervised_set}${decode_affix} \ + $treedir +fi + +dir=$exp/chain${nnet3_affix}/tdnn${tdnn_affix}${decode_affix}${egs_affix}${comb_affix:+_$comb_affix} + +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_den_fst.sh --weights $lm_weights --cmd "$train_cmd" \ + ${treedir} ${chaindir}/best_path_${unsupervised_set}${decode_affix} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output-1 input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-0-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + output-layer name=output-1-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + cp $dir/configs/final.config{,.orig} + + cat $dir/configs/final.config.orig | \ + perl -pe 's/component=output-1.affine/component=output-0.affine/g; + s/component=output-1-xent.affine/component=output-0-xent.affine/g;' > \ + $dir/configs/final.config +fi + +. $dir/configs/vars + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] +left_context_initial=$model_left_context +right_context_final=$model_right_context + +left_context=`perl -e "print int($left_context + $frame_subsampling_factor / 2)"` +right_context=`perl -e "print int($right_context + $frame_subsampling_factor / 2)"` +left_context_initial=`perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)"` +right_context_final=`perl -e "print int($right_context_final + $frame_subsampling_factor / 2)"` + +supervised_set=${supervised_set}_sp +sup_lat_dir=$exp/chain${nnet3_affix}/tri3_${supervised_set}_lats +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set} + frames_per_eg=$(cat $chaindir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 3 \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${supervised_set}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsupervised_set=${unsupervised_set}_sp +unsup_lat_dir=${chaindir}/decode_${unsupervised_set}${decode_affix} +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=$frames_per_eg + +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set}${decode_affix}${egs_affix} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
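+    # In this variant unsup_frames_per_eg is set to 300 at the top of the
+    # script, so the unsupervised examples are cut into 300-frame chunks rather
+    # than inheriting frames_per_eg from the supervised egs.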
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --phone-insertion-penalty "$phone_insertion_penalty" \ + --deriv-weights-scp $chaindir/best_path_${unsupervised_set}${decode_affix}/weights.scp \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --generate-egs-scp true \ + data/${unsupervised_set}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/${comb_affix}_egs${decode_affix}${egs_affix}_multi + +if [ $stage -le 14 ]; then + steps/nnet3/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --minibatch-size 128 --frames-per-iter 1500000 \ + --lang2weight $supervision_weights --egs-prefix cegs. 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + + +if [ $train_stage -le -4 ]; then + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${base_train_set}_sp_hires \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch "150=128/300=64" \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set}_hires \ + --tree-dir $treedir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/${decode_iter}.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/${decode_iter}.mdl $dir/${decode_iter}-output.mdl || exit 1 + iter_opts=" --iter ${decode_iter}-output " + else + nnet3-copy --edits="remove-output-nodes name=output;rename-node old-name=output-0 new-name=output" $dir/final.mdl - | \ + nnet3-am-copy --set-raw-nnet=- $dir/final.mdl $dir/final-output.mdl || exit 1 + iter_opts=" --iter final-output " + fi + + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_iter$decode_iter} || exit 1; + ) & + done +fi +wait; +exit 0; + diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_oracle.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_oracle.sh new file mode 100755 index 00000000000..aa0e433c526 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_oracle.sh @@ -0,0 +1,198 @@ +#!/bin/bash +set -e + +# Based on run_tdnn_7b.sh in the fisher swbd recipe + +# configs for 'chain' +stage=0 +tdnn_affix=7b_oracle +train_stage=-10 +get_egs_stage=-10 +decode_iter= +train_set=train_sup11k +ivector_train_set=semisup11k_250k +tree_affix= +nnet3_affix=_semi11k_250k +chain_affix=_semi11k_250k +exp=exp/semisup_11k +gmm=tri3 +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +minibatch_size=128 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +ali_dir=${gmm_dir}_ali_${train_set} +if [ $stage -le 11 ]; then + steps/align_fmllr.sh --cmd "$train_cmd" --nj 40 \ + data/${train_set} data/lang $gmm_dir $ali_dir || exit 1 + + # Build a tree using our new topology. 
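+  # The tree is built with --frame-subsampling-factor 3 so that it matches the
+  # reduced output frame rate of the chain model.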
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 11000 data/${train_set} $lang $ali_dir $treedir || exit 1 +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
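+  # This "oracle" system is trained on $train_set using its true transcripts
+  # (e.g. the combined supervised+unsupervised set when invoked from
+  # local/semisup/run_15k.sh), so it serves as a rough upper-bound reference for
+  # the semi-supervised recipes; --chain.apply-deriv-weights is false because
+  # all frames of supervision are trusted equally.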
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/run_10k.sh b/egs/fisher_english/s5/local/semisup/run_10k.sh new file mode 100644 index 00000000000..a5a293f3ce2 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/run_10k.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +. cmd.sh +. path.sh + +stage=-1 +train_stage=-10 + +. 
utils/parse_options.sh + +set -o pipefail +exp=exp/semisup_11k +false && { +utils/subset_data_dir.sh --speakers data/train_sup 11000 data/train_sup11k || exit 1 +utils/subset_data_dir.sh --shortest data/train_sup11k 5000 data/train_sup11k_short || exit 1 +utils/subset_data_dir.sh data/train_sup11k 5500 data/train_sup11k_half || exit 1 + +steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup11k_short data/lang $exp/mono0a || exit 1 + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup11k_half data/lang $exp/mono0a $exp/mono0a_ali || exit 1 + +steps/train_deltas.sh --cmd "$train_cmd" \ + 2000 10000 data/train_sup11k_half data/lang $exp/mono0a_ali $exp/tri1 || exit 1 + +(utils/mkgraph.sh data/lang_test $exp/tri1 $exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri1/graph data/dev $exp/tri1/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup11k data/lang $exp/tri1 $exp/tri1_ali || exit 1; + +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup11k data/lang $exp/tri1_ali $exp/tri2 || exit 1; + +(utils/mkgraph.sh data/lang_test $exp/tri2 $exp/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri2/graph data/dev $exp/tri2/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup11k data/lang $exp/tri2 $exp/tri2_ali || exit 1; + +steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup11k data/lang $exp/tri2_ali $exp/tri3 || exit 1; + +( + utils/mkgraph.sh data/lang_test $exp/tri3 $exp/tri3/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri3/graph data/dev $exp/tri3/decode_dev +)& + +utils/combine_data.sh data/semisup11k_250k data/train_sup11k data/train_unsup250k || exit 1 +} + +local/semisup/chain/tuning/run_tdnn_11k.sh \ + --ivector-train-set semisup11k_250k --train-set train_sup11k --stage $stage --train-stage $train_stage || exit 1 diff --git a/egs/fisher_english/s5/local/semisup/run_15k.sh b/egs/fisher_english/s5/local/semisup/run_15k.sh new file mode 100644 index 00000000000..7d5a2589a21 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/run_15k.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +. cmd.sh +. path.sh + +stage=-1 +train_stage=-10 + +. 
utils/parse_options.sh + +set -o pipefail +exp=exp/semisup_15k + +false && { +utils/subset_data_dir.sh --speakers data/train_sup 15000 data/train_sup15k || exit 1 +utils/subset_data_dir.sh --shortest data/train_sup15k 5000 data/train_sup15k_short || exit 1 +utils/subset_data_dir.sh data/train_sup15k 7500 data/train_sup15k_half || exit 1 + +steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup15k_short data/lang $exp/mono0a || exit 1 + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup15k_half data/lang $exp/mono0a $exp/mono0a_ali || exit 1 + +steps/train_deltas.sh --cmd "$train_cmd" \ + 2000 10000 data/train_sup15k_half data/lang $exp/mono0a_ali $exp/tri1 || exit 1 + +(utils/mkgraph.sh data/lang_test $exp/tri1 $exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri1/graph data/dev $exp/tri1/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup15k data/lang $exp/tri1 $exp/tri1_ali || exit 1; + +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup15k data/lang $exp/tri1_ali $exp/tri2 || exit 1; + +(utils/mkgraph.sh data/lang_test $exp/tri2 $exp/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri2/graph data/dev $exp/tri2/decode_dev)& + +steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup15k data/lang $exp/tri2 $exp/tri2_ali || exit 1; + +steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/train_sup15k data/lang $exp/tri2_ali $exp/tri3 || exit 1; + +( + utils/mkgraph.sh data/lang_test $exp/tri3 $exp/tri3/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp/tri3/graph data/dev $exp/tri3/decode_dev +)& + +utils/combine_data.sh data/semisup15k_250k data/train_sup15k data/train_unsup250k || exit 1 + +local/semisup/chain/tuning/run_tdnn_11k.sh \ + --train-set train_sup15k \ + --nnet3-affix _semi15k_250k \ + --chain-affix _semi15k_250k \ + --stage $stage --train-stage $train_stage \ + --exp $exp \ + --ivector-train-set semisup15k_250k || exit 1 +} + +local/semisup/chain/tuning/run_tdnn_oracle.sh \ + --train-set semisup15k_250k \ + --nnet3-affix _semi15k_250k \ + --chain-affix _semi15k_250k_oracle \ + --stage 9 --train-stage $train_stage \ + --exp $exp \ + --ivector-train-set semisup15k_250k || exit 1 diff --git a/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh b/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh new file mode 100755 index 00000000000..674b8745c42 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/run_semisupervised.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +set -e -o pipefail + +# e.g. try lm-scale: +# local/chain/run_semisupervised.sh --stage 1 --tdnn-affix _sup1a --egs-affix _lmwt1.0 --lattice-lm-scale 1.0 + + +# frames_per_eg 300 +# local/chain/run_semisupervised.sh --stage 1 --tdnn-affix _sup1d --unsup-frames-per-eg 300 --egs-affix _fpe300 + +stage=0 +nj=30 +decode_nj=30 +base_train_set=train_cleaned # the starting point train-set +base_gmm=tri3_cleaned # the starting point of training on the supervised data (no flat start for now) +semi_affix= # affix relating train-set splitting proportion + # (currently supervised 25%) and the base train set (currently _cleaned), etc. 
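+                 # With semi_affix empty, the supervised/unsupervised subsets are
+                 # simply named ${base_train_set}_sup and ${base_train_set}_unsup
+                 # (see the assignments after parse_options below).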
+tdnn_affix=_sup1a # affix for the supervised chain-model directory +train_supervised_opts="--stage -10 --train-stage -10" + +# combination options +decode_affix= +egs_affix= # affix for the egs that are generated from unsupervised data and for the comined egs dir +comb_affix=_comb1a # affix for new chain-model directory trained on the combined supervised+unsupervised subsets +unsup_frames_per_eg= # if empty will be equal to the supervised model's config +unsup_egs_weight=1.0 +lattice_lm_scale=0.1 # lm-scale for using the weights from unsupervised lattices +lattice_prune_beam= # If supplied will prune the lattices prior to getting egs for unsupervised data +left_tolerance=2 +right_tolerance=2 +train_combined_opts="--num-epochs 5" + +# to tune: +# frames_per_eg for unsupervised + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +supervised_set=${base_train_set}_sup${semi_affix} +unsupervised_set=${base_train_set}_unsup${semi_affix} +gmm=${base_gmm}_semi${semi_affix} # the gmm to be supplied to chain/run_tdnn.sh +nnet3_affix=_cleaned_semi${semi_affix} # affix for nnet3 and chain dirs + +if ! cuda-compiled; then + cat < data/$supervised_set/supervised_uttlist || true + utils/shuffle_list.pl data/$base_train_set/feats.scp | cut -d' ' -f1 | \ + tail -$num_unsupervised_utts > data/$supervised_set/unsupervised_uttlist || true + utils/subset_data_dir.sh --utt-list data/$supervised_set/supervised_uttlist \ + data/$base_train_set data/$supervised_set || exit 1 + utils/subset_data_dir.sh --utt-list data/$supervised_set/unsupervised_uttlist \ + data/$base_train_set data/$unsupervised_set || exit 1 + utils/data/subset_data_dir.sh --utt-list data/$unsupervised_set/feats.scp \ + data/${base_train_set}_sp_hires data/${unsupervised_set}_hires +fi + +if [ $stage -le -3 ]; then + # align the supervised subset with the current cleaned gmm + if [ -f $gmm/ali.1.gz ]; then + echo "$0: alignments in $gmm appear to already exist. Please either remove them " + echo " ... or use a later --stage option." 
+ exit 1 + fi + echo "$0: aligning the supervised data data/${supervised_set}" + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/${supervised_set} data/lang exp/$base_gmm exp/$gmm +fi + +if [ $stage -le -2 ]; then + echo "$0: chain training on the supervised subset data/${supervised_set}" + local/chain/run_tdnn.sh $train_supervised_opts --remove-egs false \ + --train-set $supervised_set --gmm $gmm \ + --nnet3-affix $nnet3_affix --tdnn-affix $tdnn_affix +fi + +if [ $stage -le -1 ]; then + echo "$0: getting ivectors for the hires unsupervised data data/${unsupervised_set}_hires" + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj "$nj" \ + data/${unsupervised_set}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires +fi + +chaindir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi + +left_context=`cat $chaindir/egs/info/left_context` +right_context=`cat $chaindir/egs/info/right_context` +left_context_initial=`cat $chaindir/egs/info/left_context_initial` +right_context_final=`cat $chaindir/egs/info/right_context_final` +[ -z $unsup_frames_per_eg ] && unsup_frames_per_eg=`cat $chaindir/egs/info/frames_per_eg` +frame_subsampling_factor=`cat $chaindir/frame_subsampling_factor` +cmvn_opts=`cat $chaindir/cmvn_opts` + +if [ $stage -le 0 ]; then + echo "$0: getting the decoding lattices for the unsupervised subset using the chain model at: $chaindir" + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $chaindir/graph data/${unsupervised_set}_hires $chaindir/decode_${unsupervised_set}${decode_affix} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${unsupervised_set}_hires \ + ${chaindir}/decode_${unsupervised_set}${decode_affix} ${chaindir}/decode_${unsupervised_set}${decode_affix}_rescore + ln -s ../final.mdl $chaindir/decode_${unsupervised_set}${decode_affix}_rescore/final.mdl || true +fi + +if [ $stage -le 1 ]; then + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $left_tolerance --right-tolerance $right_tolerance \ + --left-context $left_context --right-context $right_context \ + --left-context-initial $left_context_initial --right-context-final $right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --egs-weight $unsup_egs_weight \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${unsupervised_set}_hires \ + data/${unsupervised_set}_hires $chaindir \ + ${chaindir}/decode_${unsupervised_set}${decode_affix}_rescore $chaindir/unsup_egs${decode_affix}${egs_affix} +fi + +sup_egs_dir=$chaindir/egs +unsup_egs_dir=$chaindir/unsup_egs${decode_affix}${egs_affix} +comb_egs_dir=$chaindir/comb_egs${decode_affix}${egs_affix} +if [ $stage -le 2 ]; then + echo "$0: combining supervised/unsupervised egs" + num_archives=`cat $chaindir/egs/info/num_archives` + mkdir -p $comb_egs_dir/log + cp {$sup_egs_dir,$comb_egs_dir}/train_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/valid_diagnostic.cegs + cp {$sup_egs_dir,$comb_egs_dir}/combine.cegs + cp {$sup_egs_dir,$comb_egs_dir}/cmvn_opts + cp -r 
$sup_egs_dir/info $comb_egs_dir + cat {$sup_egs_dir,$unsup_egs_dir}/info/num_frames | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/num_frames + cat {$sup_egs_dir,$unsup_egs_dir}/info/egs_per_archive | awk '{s+=$1} END{print s}' > $comb_egs_dir/info/egs_per_archive + out_egs_list= + egs_list= + for n in $(seq $num_archives); do + egs_list="$egs_list $sup_egs_dir/cegs.$n.ark" + egs_list="$egs_list $unsup_egs_dir/cegs.$n.ark" + out_egs_list="$out_egs_list ark:$comb_egs_dir/cegs.$n.ark" + done + srand=0 + $decode_cmd $comb_egs_dir/log/combine.log \ + nnet3-chain-copy-egs "ark:cat $egs_list|" $out_egs_list +fi + +if [ $stage -le 3 ]; then + echo "$0: training on the supervised+unsupervised subset" + # the train-set and gmm do not matter as we are providing the egs + local/chain/run_tdnn.sh --stage 17 --remove-egs false --train-set $supervised_set --gmm $gmm \ + --nnet3-affix $nnet3_affix \ + --tdnn-affix ${tdnn_affix}${decode_affix}${egs_affix}${comb_affix} \ + --common-egs-dir $comb_egs_dir $train_combined_opts +fi + diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh index 99921a9bf61..1c4a032fc57 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh @@ -52,6 +52,7 @@ train_set=train_cleaned gmm=tri3_cleaned # the gmm for the target data num_threads_ubm=32 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=4 # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. @@ -59,6 +60,7 @@ train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. tdnn_affix=1d #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true # End configuration section. echo "$0 $@" # Print the command line for logging @@ -212,13 +214,13 @@ if [ $stage -le 18 ]; then --egs.chunk-width 150 \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ + --trainer.num-epochs $num_epochs \ --trainer.optimization.num-jobs-initial 2 \ --trainer.optimization.num-jobs-final 12 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ + --cleanup.remove-egs $remove_egs \ --feat-dir $train_data_dir \ --tree-dir $tree_dir \ --lat-dir $lat_dir \ diff --git a/egs/wsj/s5/steps/best_path_weights.sh b/egs/wsj/s5/steps/best_path_weights.sh new file mode 100755 index 00000000000..c2e0c60f961 --- /dev/null +++ b/egs/wsj/s5/steps/best_path_weights.sh @@ -0,0 +1,160 @@ +#!/bin/bash + +# Copyright 2014-17 Vimal Manohar + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+ + +# This script combines frame-level posteriors from different decode +# directories. The first decode directory is assumed to be the primary +# and is used to get the best path. The posteriors from other decode +# directories are interpolated with the posteriors of the best path. +# The output is a new directory with final.mdl, tree from the primary +# decode-dir and the best path alignments and weights in a decode-directory +# with the same basename as the primary directory. +# This is typically used to get better posteriors for semisupervised training +# of DNN +# e.g. local/combine_posteriors.sh exp/tri6_nnet/decode_train_unt.seg +# exp/sgmm_mmi_b0.1/decode_fmllr_train_unt.seg_it4 exp/combine_dnn_sgmm +# Here the final.mdl and tree are copied from exp/tri6_nnet to +# exp/combine_dnn_sgmm. ali.*.gz obtained from the primary dir and +# the interpolated posteriors in weights.scp are placed in +# exp/combine_dnn_sgmm/decode_train_unt.seg + +set -e + +# begin configuration section. +cmd=run.pl +stage=-10 +acwt=0.1 +#end configuration section. + +help_message="Usage: "$(basename $0)" [options] [:weight] [:weight] [[:weight] ... ] + E.g. "$(basename $0)" data/train_unt.seg data/lang exp/tri1/decode:0.5 exp/tri2/decode:0.25 exp/tri3/decode:0.25 exp/combine +Options: + --cmd (run.pl|queue.pl...) # specify how to run the sub-processes. +"; + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -lt 4 ]; then + printf "$help_message\n"; + exit 1; +fi + +data=$1 +lang=$2 +dir=${@: -1} # last argument to the script +shift 2; +decode_dirs=( $@ ) # read the remaining arguments into an array +unset decode_dirs[${#decode_dirs[@]}-1] # 'pop' the last argument which is odir +num_sys=${#decode_dirs[@]} # number of systems to combine + +mkdir -p $dir +mkdir -p $dir/log + +decode_dir=`echo ${decode_dirs[0]} | cut -d: -f1` +nj=`cat $decode_dir/num_jobs` + +mkdir -p $dir + +if [ $stage -lt -1 ]; then + mkdir -p $dir/log + $cmd JOB=1:$nj $dir/log/best_path.JOB.log \ + lattice-best-path --acoustic-scale=$acwt \ + "ark,s,cs:gunzip -c $decode_dir/lat.JOB.gz |" \ + ark:/dev/null "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 +fi + +src_dir=`dirname $decode_dir` + +cp $src_dir/cmvn_opts $dir/ || exit 1 +for f in final.mat splice_opts frame_subsampling_factor; do + [ -f $src_dir/$f ] && cp $src_dir/$f $dir +done + +weights_sum=0.0 + +for i in `seq 0 $[num_sys-1]`; do + decode_dir=${decode_dirs[$i]} + + weight=`echo $decode_dir | cut -d: -s -f2` + [ -z "$weight" ] && weight=1.0 + + if [ $i -eq 0 ]; then + file_list="\"ark:vector-scale --scale=$weight ark:$dir/weights.$i.JOB.ark ark:- |\"" + else + file_list="$file_list \"ark,s,cs:vector-scale --scale=$weight ark:$dir/weights.$i.JOB.ark ark:- |\"" + fi + + weights_sum=`perl -e "print STDOUT $weights_sum + $weight"` +done + +inv_weights_sum=`perl -e "print STDOUT 1.0/$weights_sum"` + +fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}` + +for i in `seq 0 $[num_sys-1]`; do + if [ $stage -lt $i ]; then + decode_dir=`echo ${decode_dirs[$i]} | cut -d: -f1` + + model=`dirname $decode_dir`/final.mdl # model one level up from decode dir + tree=`dirname $decode_dir`/tree # tree one level up from decode dir + + for f in $model $decode_dir/lat.1.gz $tree; do + [ ! 
-f $f ] && echo "$0: expecting file $f to exist" && exit 1; + done + if [ $i -eq 0 ]; then + nj=`cat $decode_dir/num_jobs` || exit 1; + cp $model $dir || exit 1 + cp $tree $dir || exit 1 + echo $nj > $dir/num_jobs + else + if [ $nj != `cat $decode_dir/num_jobs` ]; then + echo "$0: number of decoding jobs mismatches, $nj versus `cat $decode_dir/num_jobs`" + exit 1; + fi + fi + + $cmd JOB=1:$nj $dir/log/get_post.$i.JOB.log \ + lattice-to-post --acoustic-scale=$acwt \ + "ark,s,cs:gunzip -c $decode_dir/lat.JOB.gz|" ark:- \| \ + post-to-pdf-post $model ark,s,cs:- ark:- \| \ + get-post-on-ali ark,s,cs:- "ark,s,cs:gunzip -c $dir/ali.JOB.gz | convert-ali $dir/final.mdl $model $tree ark,s,cs:- ark:- | ali-to-pdf $model ark,s,cs:- ark:- |" "ark,scp:$fdir/weights.$i.JOB.ark,$fdir/weights.$i.JOB.scp" || exit 1 + fi +done + +if [ $stage -lt $num_sys ]; then + if [ "$num_sys" -eq 1 ]; then + for n in `seq $nj`; do + cat $dir/weights.0.$n.scp + done > $dir/weights.scp + else + $cmd JOB=1:$nj $dir/log/interpolate_post.JOB.log \ + vector-sum $file_list ark:- \| \ + vector-scale --scale=$inv_weights_sum ark:- \ + ark,scp:$fdir/weights.JOB.ark,$fdir/weights.JOB.scp || exit 1 + + for n in `seq $nj`; do + cat $dir/weights.$n.scp + done > $dir/weights.scp + fi +fi + +for n in `seq 1 $[num_sys-1]`; do + rm $dir/weights.$n.*.ark $dir/weights.$n.*.scp +done + +exit 0 diff --git a/egs/wsj/s5/steps/conf/apply_calibration.sh b/egs/wsj/s5/steps/conf/apply_calibration.sh index c1a22e274b8..48f9e17d30b 100755 --- a/egs/wsj/s5/steps/conf/apply_calibration.sh +++ b/egs/wsj/s5/steps/conf/apply_calibration.sh @@ -28,6 +28,7 @@ caldir=$4 dir=$5 model=$latdir/../final.mdl # assume model one level up from decoding dir. +model_dir=$latdir/.. calibration=$caldir/calibration.mdl word_feats=$caldir/word_feats word_categories=$caldir/word_categories @@ -49,6 +50,12 @@ cp $calibration $dir/calibration.mdl cp $word_feats $dir/word_feats cp $word_categories $dir/word_categories +frame_shift_opt= +if [ -f $model_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $model_dir/frame_subsampling_factor) + frame_shift_opt="--frame-shift=0.0$frame_subsampling_factor" +fi + # Create the ctm with raw confidences, # - we keep the timing relative to the utterance, if [ $stage -le 0 ]; then @@ -58,7 +65,7 @@ if [ $stage -le 0 ]; then lattice-push --push-strings=false ark:- ark:- \| \ lattice-align-words-lexicon --max-expand=10.0 \ $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr $frame_shift_opt ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \ '>' $dir/JOB.ctm # Merge and clean, @@ -76,7 +83,7 @@ fi # Create the forwarding data for logistic regression, if [ $stage -le 2 ]; then steps/conf/prepare_calibration_data.py --conf-feats $dir/forward_feats.ark \ - --lattice-depth $latdepth $dir/ctm_int $word_feats $word_categories + --lattice-depth $latdepth $frame_shift_opt $dir/ctm_int $word_feats $word_categories fi # Apply calibration model to dev, diff --git a/egs/wsj/s5/steps/conf/convert_ctm_to_weights.py b/egs/wsj/s5/steps/conf/convert_ctm_to_weights.py new file mode 100755 index 00000000000..02a616b2c03 --- /dev/null +++ b/egs/wsj/s5/steps/conf/convert_ctm_to_weights.py @@ -0,0 +1,101 @@ +#! 
/usr/bin/env python + +import argparse +import logging +import sys + +sys.path.insert(0, 'steps') +import libs.common as common_lib + +logger = logging.getLogger('libs') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script converts CTM to per-frame weights by the word + posteriors in the CTM as the weights.""") + + parser.add_argument("--frame-shift", type=float, default=0.01, + help="Frame shift value in seconds") + parser.add_argument("--default-weight", type=float, default=1.0, + help="Default weight on silence frames") + parser.add_argument("segments_in", type=str, help="Input segments file") + parser.add_argument("ctm_in", type=str, help="Input utterance-level CTM " + "file i.e. the first column has utterance-ids") + parser.add_argument("weights_out", type=str, help="Output per-frame " + "weights vector written in Kaldi text archive format") + + args = parser.parse_args() + + return args + + +def run(args): + utt2num_frames = {} + with common_lib.smart_open(args.segments_in) as segments_reader: + for line in segments_reader.readlines(): + parts = line.strip().split() + if len(parts) not in [4, 5]: + raise RuntimeError("Invalid line {0} in segments file {1}" + "".format(line.strip(), args.segments_in)) + utt2num_frames[parts[0]] = int((float(parts[3]) - float(parts[2])) + / args.frame_shift + 0.5) + + num_utt = 0 + with common_lib.smart_open(args.ctm_in) as ctm_reader, \ + common_lib.smart_open(args.weights_out, 'w') as weights_writer: + prev_utt = None + weights = [] + for line in ctm_reader.readlines(): + parts = line.strip().split() + if len(parts) not in [5, 6]: + raise RuntimeError("Invalid line {0} in CTM file {1}" + "".format(line.strip(), args.ctm_in)) + + utt = parts[0] + if utt != prev_utt: + if prev_utt is not None: + assert len(weights) >= utt2num_frames[prev_utt] + common_lib.write_vector_ascii(weights_writer, weights, + key=prev_utt) + weights = [args.default_weight for x in + range(utt2num_frames[utt])] + + start_time = float(parts[2]) + dur = float(parts[3]) + prob = 1.0 if len(parts) == 5 else float(parts[5]) + + start_frame = int(start_time / args.frame_shift + 0.5) + length = int(dur / args.frame_shift) + + if len(weights) < start_frame + length: + weights.extend([args.default_weight for x in + range(len(weights), start_frame + length)]) + for x in range(start_frame, start_frame + length): + weights[x] = prob + + assert len(weights) >= start_frame + length + prev_utt = utt + num_utt += 1 + assert len(weights) >= utt2num_frames[prev_utt] + common_lib.write_vector_ascii(weights_writer, weights, + key=prev_utt) + + if num_utt == 0: + raise RuntimeError("Failed to process any utterances") + + +def main(): + args = get_args() + run(args) + + +if __name__ == "__main__": + main() diff --git a/egs/wsj/s5/steps/conf/prepare_calibration_data.py b/egs/wsj/s5/steps/conf/prepare_calibration_data.py index bc8f92a2f7f..753771b1d89 100755 --- a/egs/wsj/s5/steps/conf/prepare_calibration_data.py +++ b/egs/wsj/s5/steps/conf/prepare_calibration_data.py @@ -10,7 +10,7 @@ Prepare input features and training targets for logistic regression, which calibrates the Minimum Bayes Risk posterior confidences. 
-The logisitc-regression input features are: +The logisitc-regression input features are: - posteriors from 'ctm' transformed by logit, - logarithm of word-length in letters, - 10base logarithm of unigram probability of a word from language model, @@ -34,6 +34,8 @@ parser.add_option("--conf-targets", help="Targets file for logistic regression (no targets generated if '') [default %default]", default='') parser.add_option("--conf-feats", help="Feature file for logistic regression. [default %default]", default='') parser.add_option("--lattice-depth", help="Per-frame lattice depths, ascii-ark (optional). [default %default]", default='') +parser.add_option("--frame-shift", type=float, default=0.01, + help="Frame shift value in seconds [default %default]") (o, args) = parser.parse_args() if len(args) != 3: @@ -63,11 +65,11 @@ if o.conf_targets != '': with open(o.conf_targets,'w') as f: for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm: - # Skip the words we don't know if being correct, - if score_tag == 'U': continue + # Skip the words we don't know if being correct, + if score_tag == 'U': continue # Some words are excluded from training (partial words, hesitations, etc.), # (Value: 1 == keep word, 0 == exclude word from the targets), - if not word_filter[wrd_id]: continue + if not word_filter[wrd_id]: continue # Build the key, key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag) # Build the target, @@ -102,7 +104,7 @@ # - log of word-length, log_word_length = math.log(word_length[wrd_id]) # i.e. number of phones in a word, # - categorical distribution of words (with frequency higher than min-count), - wrd_1_of_k = [0]*wrd_cat_num; + wrd_1_of_k = [0]*wrd_cat_num; wrd_1_of_k[wrd_to_cat[wrd_id]] = 1; # Compose the input feature vector, @@ -110,10 +112,10 @@ # Optionally add average-depth of lattice at the word position, if o.lattice_depth != '': - depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))] + depth_slice = depths[utt][int(float(beg) / o.frame_shift + 0.5):int((float(beg) + max(o.frame_shift, float(dur))) / o.frame_shift + 0.5)] log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice)) feats += [ log_avg_depth ] - # Store the input features, + # Store the input features, f.write(key + ' [ ' + ' '.join(map(str,feats)) + ' ]\n') diff --git a/egs/wsj/s5/steps/conf/train_calibration.sh b/egs/wsj/s5/steps/conf/train_calibration.sh index c2aca05056e..9a8451c9f85 100755 --- a/egs/wsj/s5/steps/conf/train_calibration.sh +++ b/egs/wsj/s5/steps/conf/train_calibration.sh @@ -12,7 +12,7 @@ # (- categorical distribution of 'lang/words.txt', DISABLED) # begin configuration section. -cmd= +cmd=run.pl lmwt=12 decode_mbr=true word_min_count=10 # Minimum word-count for single-word category, @@ -43,6 +43,7 @@ latdir=$4 dir=$5 model=$latdir/../final.mdl # assume model one level up from decoding dir. +model_dir=$latdir/.. for f in $data/text $lang/words.txt $word_feats $latdir/lat.1.gz; do [ ! 
-f $f ] && echo "$0: Missing file $f" && exit 1 @@ -57,6 +58,12 @@ echo $lmwt >$dir/lmwt echo $decode_mbr >$dir/decode_mbr cp $word_feats $dir/word_feats +frame_shift_opt= +if [ -f $model_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $model_dir/frame_subsampling_factor) + frame_shift_opt="--frame-shift=0.0$frame_subsampling_factor" +fi + # Create the ctm with raw confidences, # - we keep the timing relative to the utterance, if [ $stage -le 0 ]; then @@ -66,7 +73,7 @@ if [ $stage -le 0 ]; then lattice-push --push-strings=false ark:- ark:- \| \ lattice-align-words-lexicon --max-expand=10.0 \ $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr $frame_shift_opt ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \ '>' $dir/JOB.ctm # Merge and clean, @@ -104,7 +111,7 @@ fi if [ $stage -le 3 ]; then steps/conf/prepare_calibration_data.py \ --conf-targets $dir/train_targets.ark --conf-feats $dir/train_feats.ark \ - --lattice-depth $latdepth $dir/ctm_aligned_int $word_feats $dir/word_categories + --lattice-depth $latdepth $frame_shift_opt $dir/ctm_aligned_int $word_feats $dir/word_categories fi # Train the logistic regression, diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index 8727ccd1a5e..d147cd7ba86 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -358,6 +358,33 @@ def write_matrix_ascii(file_or_fd, mat, key=None): if fd is not file_or_fd : fd.close() +def write_vector_ascii(file_or_fd, vec, key=None): + """This function writes the vector 'vec' stored as a list + in kaldi vector text format. + The destination can be a file or an opened file descriptor. + If key is provided, then vector is written to an archive with the 'key' + as the index field. + """ + try: + fd = open(file_or_fd, 'w') + except TypeError: + # 'file_or_fd' is opened file descriptor, + fd = file_or_fd + + try: + if key is not None: + print ("{0} [".format(key), + file=fd, end=' ') # ark-files have keys (utterance-id) + else: + print (" [", file=fd, end=' ') + + line = ' '.join(["{0:f}".format(x) for x in vec]) + line += " ]" + print (line, file=fd) + finally: + if fd is not file_or_fd : fd.close() + + def read_matrix_ascii(file_or_fd): """This function reads a matrix in kaldi matrix text format and stores it as a list of lists. diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index fedce12dda0..32b320c495e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -129,7 +129,8 @@ def train_new_models(dir, iter, srand, num_jobs, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch_str, frame_subsampling_factor, run_opts, - backstitch_training_scale=0.0, backstitch_training_interval=1): + backstitch_training_scale=0.0, backstitch_training_interval=1, + use_multitask_egs=False): """ Called from train_one_iteration(), this method trains new models with 'num_jobs' jobs, and @@ -140,6 +141,12 @@ def train_new_models(dir, iter, srand, num_jobs, to use for each job is a little complex, so we spawn each one separately. 
this is no longer true for RNNs as we use do not use the --frame option but we use the same script for consistency with FF-DNN code + + use_multitask_egs : True, if different examples used to train multiple + tasks or outputs, e.g.multilingual training. + multilingual egs can be generated using get_egs.sh and + steps/nnet3/multilingual/allocate_multilingual_examples.py, + those are the top-level scripts. """ deriv_time_opts = [] @@ -167,6 +174,12 @@ def train_new_models(dir, iter, srand, num_jobs, frame_shift = ((archive_index + k/num_archives) % frame_subsampling_factor) + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="cegs.", + archive_index=archive_index, + use_multitask_egs=use_multitask_egs) + scp_or_ark = "scp" if use_multitask_egs else "ark" cache_io_opts = (("--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) if iter > 0 else "") + @@ -186,9 +199,9 @@ def train_new_models(dir, iter, srand, num_jobs, --backstitch-training-interval={backstitch_training_interval} \ --srand={srand} \ "{raw_model}" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs \ + "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} \ --frame-shift={fr_shft} \ - ark:{egs_dir}/cegs.{archive_index}.ark ark:- | \ + {scp_or_ark}:{egs_dir}/cegs.{archive_index}.{scp_or_ark} ark:- | \ nnet3-chain-shuffle-egs --buffer-size={buf_size} \ --srand={srand} ark:- ark:- | nnet3-chain-merge-egs \ --minibatch-size={num_chunk_per_mb} ark:- ark:- |" \ @@ -210,17 +223,17 @@ def train_new_models(dir, iter, srand, num_jobs, raw_model=raw_model_string, egs_dir=egs_dir, archive_index=archive_index, buf_size=shuffle_buffer_size, - num_chunk_per_mb=num_chunk_per_minibatch_str), + num_chunk_per_mb=num_chunk_per_minibatch_str, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark), require_zero_status=True) threads.append(thread) - for thread in threads: thread.join() - def train_one_iteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, learning_rate, shrinkage_value, @@ -232,7 +245,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, run_opts, dropout_edit_string="", - backstitch_training_scale=0.0, backstitch_training_interval=1): + backstitch_training_scale=0.0, backstitch_training_interval=1, + use_multitask_egs=False): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -264,7 +278,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, compute_train_cv_probabilities( dir=dir, iter=iter, egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts) + leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, + use_multitask_egs=use_multitask_egs) if iter > 0: # Runs in the background @@ -321,7 +336,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, # first few iterations (hard-coded as 15) backstitch_training_scale=(backstitch_training_scale * iter / 15 if iter < 15 else backstitch_training_scale), - backstitch_training_interval=backstitch_training_interval) + backstitch_training_interval=backstitch_training_interval, + use_multitask_egs=use_multitask_egs) [models_to_average, best_model] = common_train_lib.get_successful_models( num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) @@ -373,7 +389,7 @@ def check_for_required_files(feat_dir, tree_dir, lat_dir): def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, 
run_opts, max_lda_jobs=None, rand_prune=4.0, - lda_opts=None): + lda_opts=None, use_multitask_egs=False): """ Function to estimate and write LDA matrix from cegs This function is exactly similar to the version in module @@ -383,17 +399,28 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, if max_lda_jobs is not None: if num_lda_jobs > max_lda_jobs: num_lda_jobs = max_lda_jobs + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="cegs.", + archive_index="JOB", + use_multitask_egs=use_multitask_egs) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_rspecifier = ( + "ark:nnet3-chain-copy-egs {multitask_egs_opts} " + "{scp_or_ark}:{egs_dir}/cegs.JOB.{scp_or_ark} ark:- |" + "".format(egs_dir=egs_dir, scp_or_ark=scp_or_ark, + multitask_egs_opts=multitask_egs_opts)) # Write stats with the same format as stats for LDA. common_lib.execute_command( """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \ - {dir}/init.raw "ark:{egs_dir}/cegs.JOB.ark" \ + {dir}/init.raw "{egs_rspecifier}" \ {dir}/JOB.lda_stats""".format( command=run_opts.command, num_lda_jobs=num_lda_jobs, dir=dir, - egs_dir=egs_dir, + egs_rspecifier=egs_rspecifier, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats @@ -448,32 +475,50 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1): def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, - run_opts): + run_opts, + use_multitask_egs=False): model = '{0}/{1}.mdl'.format(dir, iter) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_suffix = ".scp" if use_multitask_egs else ".cegs" + + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="valid_diagnostic.", + use_multitask_egs=use_multitask_egs) + common_lib.background_command( """{command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/valid_diagnostic.cegs \ + "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/valid_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, l2=l2_regularize, leaky=leaky_hmm_coefficient, xent_reg=xent_regularize, - egs_dir=egs_dir)) + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) + + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="train_diagnostic.", + use_multitask_egs=use_multitask_egs) common_lib.background_command( """{command} {dir}/log/compute_prob_train.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/train_diagnostic.cegs \ + "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/train_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, l2=l2_regularize, leaky=leaky_hmm_coefficient, xent_reg=xent_regularize, - egs_dir=egs_dir)) + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, 
+ scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) def compute_progress(dir, iter, run_opts): @@ -493,10 +538,12 @@ def compute_progress(dir, iter, run_opts): model=model, prev_model=prev_model)) + def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, egs_dir, leaky_hmm_coefficient, l2_regularize, xent_regularize, run_opts, - sum_to_one_penalty=0.0): + sum_to_one_penalty=0.0, + use_multitask_egs=False): """ Function to do model combination In the nnet3 setup, the logic @@ -522,6 +569,14 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st print("{0}: warning: model file {1} does not exist " "(final combination)".format(sys.argv[0], model_file)) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_suffix = ".scp" if use_multitask_egs else ".cegs" + + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="combine.", + use_multitask_egs=use_multitask_egs) + # We reverse the order of the raw model strings so that the freshest one # goes first. This is important for systems that include batch # normalization-- it means that the freshest batch-norm stats are used. @@ -539,7 +594,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st --sum-to-one-penalty={penalty} \ --enforce-positive-weights=true \ --verbose=3 {dir}/den.fst {raw_models} \ - "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/combine.cegs ark:- | \ + "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/combine{egs_suffix} ark:- | \ nnet3-chain-merge-egs --minibatch-size={num_chunk_per_mb} \ ark:- ark:- |" - \| \ nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl \ @@ -554,7 +609,9 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st penalty=sum_to_one_penalty, num_chunk_per_mb=num_chunk_per_minibatch_str, num_iters=num_iters, - egs_dir=egs_dir)) + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the @@ -563,4 +620,5 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st dir=dir, iter='final', egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, leaky_hmm_coefficient=leaky_hmm_coefficient, - run_opts=run_opts) + run_opts=run_opts, + use_multitask_egs=use_multitask_egs) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 559e7498fb7..a3beb2e5bef 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -399,7 +399,7 @@ def verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_extractor_id, if (feat_dim != 0 and feat_dim != egs_feat_dim) or (ivector_dim != egs_ivector_dim): raise Exception("There is mismatch between featdim/ivector_dim of " "the current experiment and the provided " - "egs directory") + "egs directory: egs_dim: {0} vs {1} and ivector_dim {2} vs {3}".format(feat_dim, egs_feat_dim, ivector_dim, egs_ivector_dim)) if (((egs_ivector_id is None) and (ivector_extractor_id is not None)) or ((egs_ivector_id is not None) and (ivector_extractor_id is None))): diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 4d142ba3266..47abec00bde 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ 
b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -322,21 +322,32 @@ def train_one_iteration(dir, iter, srand, egs_dir, def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, max_lda_jobs=None, rand_prune=4.0, - lda_opts=None): + lda_opts=None, use_multitask_egs=False): if max_lda_jobs is not None: if num_lda_jobs > max_lda_jobs: num_lda_jobs = max_lda_jobs + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="egs.", + archive_index="JOB", + use_multitask_egs=use_multitask_egs) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_rspecifier = ( + "ark:nnet3-copy-egs {multitask_egs_opts} " + "{scp_or_ark}:{egs_dir}/egs.JOB.{scp_or_ark} ark:- |" + "".format(egs_dir=egs_dir, scp_or_ark=scp_or_ark, + multitask_egs_opts=multitask_egs_opts)) # Write stats with the same format as stats for LDA. common_lib.execute_command( """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ nnet3-acc-lda-stats --rand-prune={rand_prune} \ - {dir}/init.raw "ark:{egs_dir}/egs.JOB.ark" \ + {dir}/init.raw "{egs_rspecifier}" \ {dir}/JOB.lda_stats""".format( command=run_opts.command, num_lda_jobs=num_lda_jobs, dir=dir, - egs_dir=egs_dir, + egs_rspecifier=egs_rspecifier, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree_from_lats.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree_from_lats.sh new file mode 100755 index 00000000000..6ed988062b3 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree_from_lats.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + + +# This script builds a tree for use in the 'chain' systems (although the script +# itself is pretty generic and doesn't use any 'chain' binaries). This is just +# like the first stages of a standard system, like 'train_sat.sh', except it +# does 'convert-ali' to convert alignments to a monophone topology just created +# from the 'lang' directory (in case the topology is different from where you +# got the system's alignments from), and it stops after the tree-building and +# model-initialization stage, without re-estimating the Gaussians or training +# the transitions. + + +# Begin configuration section. +stage=-5 +exit_stage=-100 # you can use this to require it to exit at the + # beginning of a specific stage. Not all values are + # supported. +cmd=run.pl +context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. +cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves +frame_subsampling_factor=1 +alignment_subsampling_factor=1 +leftmost_questions_truncate=-1 # note: this used to default to 10, but we never + # use this option now with value != -1, and + # we're changing the default +acwt=0.1 +tree_stats_opts= +cluster_phones_opts= +repeat_frames=false +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "Usage: steps/train_sat.sh <#leaves> " + echo " e.g.: steps/train_sat.sh 2500 15000 data/train_si84 data/lang exp/tri2b_lats_si84 exp/tri3b" + echo "Main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --config # config containing options" + echo " --stage # stage to do partial re-run from." 
+ echo " --repeat-frames # Only affects alignment conversion at" + echo " # the end. If true, generate an " + echo " # alignment using the frame-subsampled " + echo " # topology that is repeated " + echo " # --frame-subsampling-factor times " + echo " # and interleaved, to be the same " + echo " # length as the original alignment " + echo " # (useful for cross-entropy training " + echo " # of reduced frame rate systems)." + exit 1; +fi + +numleaves=$1 +data=$2 +lang=$3 +lat_dir=$4 +dir=$5 + +for f in $data/feats.scp $lang/phones.txt $lat_dir/final.mdl $lat_dir/lat.1.gz; do + [ ! -f $f ] && echo "train_sat.sh: no such file $f" && exit 1; +done + +oov=`cat $lang/oov.int` +nj=`cat $lat_dir/num_jobs` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` +ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; +sdata=$data/split$nj; +splice_opts=`cat $lat_dir/splice_opts 2>/dev/null` # frame-splicing options. +cmvn_opts=`cat $lat_dir/cmvn_opts 2>/dev/null` +delta_opts=`cat $lat_dir/delta_opts 2>/dev/null` + +mkdir -p $dir/log +cp $lat_dir/splice_opts $dir 2>/dev/null # frame-splicing options. +cp $lat_dir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $lat_dir/delta_opts $dir 2>/dev/null # delta option. + +utils/lang/check_phones_compatible.sh $lang/phones.txt $lat_dir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; + +echo $nj >$dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +# Set up features. + +if [ -f $lat_dir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +## Set up speaker-independent features. +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $lat_dir/final.mat ark:- ark:- |" + cp $lat_dir/final.mat $dir + cp $lat_dir/full.mat $dir 2>/dev/null + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +# Add fMLLR transforms if available +if [ -f $lat_dir/trans.1 ]; then + echo "$0: Using transforms from $lat_dir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$lat_dir/trans.JOB ark:- ark:- |" +fi + +# Do subsampling of feats, if needed +if [ $frame_subsampling_factor -gt 1 ]; then + feats="$feats subsample-feats --n=$frame_subsampling_factor ark:- ark:- |" +fi + +if [ $stage -le -5 ]; then + echo "$0: Initializing monophone model (for alignment conversion, in case topology changed)" + + [ ! -f $lang/phones/sets.int ] && exit 1; + shared_phones_opt="--shared-phones=$lang/phones/sets.int" + # get feature dimension + example_feats="`echo $feats | sed s/JOB/1/g`"; + if ! feat_dim=$(feat-to-dim "$example_feats" - 2>/dev/null) || [ -z $feat_dim ]; then + feat-to-dim "$example_feats" - # to see the error message. + echo "error getting feature dimension" + exit 1; + fi + $cmd JOB=1 $dir/log/init_mono.log \ + gmm-init-mono $shared_phones_opt "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo $feat_dim \ + $dir/mono.mdl $dir/mono.tree || exit 1; +fi + + +if [ $stage -le -4 ]; then + # Get tree stats. 
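+  # The pipeline below takes the best path through each lattice, converts the
+  # resulting alignment to the monophone model/tree initialized above (with
+  # the alignment subsampling factor applied), accumulates tree stats per job,
+  # and then sums them into a single treeacc file.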
+ echo "$0: Accumulating tree stats" + $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ + lattice-best-path --acoustic-scale=$acwt \ + "ark:gunzip -c $lat_dir/lat.JOB.gz|" ark:/dev/null ark:- \| \ + convert-ali --frame-subsampling-factor=$alignment_subsampling_factor \ + $lat_dir/final.mdl $dir/mono.mdl $dir/mono.tree ark:- ark:- \| \ + acc-tree-stats $context_opts $tree_stats_opts --ci-phones=$ciphonelist $dir/mono.mdl \ + "$feats" ark:- $dir/JOB.treeacc || exit 1; + [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1; + $cmd $dir/log/sum_tree_acc.log \ + sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1; + rm $dir/*.treeacc +fi + +if [ $stage -le -3 ] && $train_tree; then + echo "$0: Getting questions for tree clustering." + # preparing questions, roots file... + $cmd $dir/log/questions.log \ + cluster-phones $cluster_phones_opts $context_opts $dir/treeacc \ + $lang/phones/sets.int $dir/questions.int || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + $cmd $dir/log/compile_questions.log \ + compile-questions --leftmost-questions-truncate=$leftmost_questions_truncate \ + $context_opts $lang/topo $dir/questions.int $dir/questions.qst || exit 1; + + # questions_truncated.int will be needed later on when we build the phone + # language model for 'chain' training. It's a mechanism of keeping the graph + # small. + if [ $leftmost_questions_truncate -gt 0 ]; then + head -n $leftmost_questions_truncate $dir/questions.int > $dir/questions_truncated.int + else + cp $dir/questions.int $dir/questions_truncated.int + fi + + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; +fi + +if [ $stage -le -2 ]; then + echo "$0: Initializing the model" + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1; + grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning."; + rm $dir/treeacc +fi + +if [ $stage -le -1 ]; then + # Convert the alignments to the new tree. Note: we likely will not use these + # converted alignments in the CTC system directly, but they could be useful + # for other purposes. + echo "$0: Converting alignments from $lat_dir to use current tree" + $cmd JOB=1:$nj $dir/log/convert.JOB.log \ + lattice-best-path --acoustic-scale=$acwt \ + "ark:gunzip -c $lat_dir/lat.JOB.gz |" ark:/dev/null ark:- \| \ + convert-ali --repeat-frames=$repeat_frames \ + --frame-subsampling-factor=$alignment_subsampling_factor \ + $lat_dir/final.mdl $dir/1.mdl $dir/tree \ + ark:- "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +cp $dir/1.mdl $dir/final.mdl + +echo $0: Done building tree + diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh new file mode 100755 index 00000000000..6892a2ff1ee --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh @@ -0,0 +1,275 @@ +#!/bin/bash +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2017 Vimal Manohar +# Apache 2.0. + +# This script is similar to steps/nnet3/chain/build_tree.sh but supports +# getting statistics from multiple alignment sources. + + +# Begin configuration section. +stage=-5 +exit_stage=-100 # you can use this to require it to exit at the + # beginning of a specific stage. 
Not all values are + # supported. +cmd=run.pl +use_fmllr=true # If true, fmllr transforms will be applied from the alignment directories. + # Otherwise, no fmllr will be applied even if alignment directory contains trans.* +context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. +cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves +frame_subsampling_factor=1 # frame subsampling factor of output w.r.t. to the input features +tree_stats_opts= +cluster_phones_opts= +repeat_frames=false +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -lt 5 ]; then + echo "Usage: steps/nnet3/chain/build_tree_multiple_sources.sh <#leaves> [ ... ] " + echo " e.g.: steps/nnet3/chain/build_tree_multiple_sources.sh 15000 data/lang data/train_sup exp/tri3_ali data/train_unsup exp/tri3/best_path_train_unsup exp/tree_semi" + echo "Main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --config # config containing options" + echo " --stage # stage to do partial re-run from." + echo " --repeat-frames # Only affects alignment conversion at" + echo " # the end. If true, generate an " + echo " # alignment using the frame-subsampled " + echo " # topology that is repeated " + echo " # --frame-subsampling-factor times " + echo " # and interleaved, to be the same " + echo " # length as the original alignment " + echo " # (useful for cross-entropy training " + echo " # of reduced frame rate systems)." + exit 1; +fi + +numleaves=$1 +lang=$2 +dir=${@: -1} # last argument to the script +shift 2; +data_and_alidirs=( $@ ) # read the remaining arguments into an array +unset data_and_alidirs[${#data_and_alidirs[@]}-1] # 'pop' the last argument which is odir +num_sys=$[${#data_and_alidirs[@]}] # number of systems to combine + +if (( $num_sys % 2 != 0 )); then + echo "$0: The data and alignment arguments must be an even number of arguments." + exit 1 +fi + +num_sys=$((num_sys / 2)) + +data=$dir/data_tmp +mkdir -p $data + +mkdir -p $dir +alidir=`echo ${data_and_alidirs[1]}` + +datadirs=() +alidirs=() +for n in `seq 0 $[num_sys-1]`; do + datadirs[$n]=${data_and_alidirs[$[2*n]]} + alidirs[$n]=${data_and_alidirs[$[2*n+1]]} +done + +utils/combine_data.sh $data ${datadirs[@]} || exit 1 + +for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +oov=`cat $lang/oov.int` +nj=`cat $alidir/num_jobs` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` +ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; +sdata=$data/split$nj; +splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. +cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` || exit 1 +delta_opts=`cat $alidir/delta_opts 2>/dev/null` + +mkdir -p $dir/log +cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options. +cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null # delta option. + +utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; + +echo $nj >$dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +# Set up features. 
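+# A feature pipeline is built per source (indexed by $n below): CMVN plus
+# either deltas or splice+LDA, to match the alignment directory, optionally
+# followed by that source's fMLLR transforms and by frame subsampling.
+# feats_one[$n] is the same pipeline restricted to split 1 of the data; it is
+# only used to initialize the monophone model.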
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+
+echo "$0: feature type is $feat_type"
+
+feats=()
+feats_one=()
+for n in `seq 0 $[num_sys-1]`; do
+  this_nj=$(cat ${alidirs[$n]}/num_jobs) || exit 1
+  this_sdata=${datadirs[$n]}/split$this_nj
+  [[ -d $this_sdata && ${datadirs[$n]}/feats.scp -ot $this_sdata ]] || split_data.sh ${datadirs[$n]} $this_nj || exit 1;
+  ## Set up speaker-independent features.
+  case $feat_type in
+    delta) feats[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/JOB/utt2spk scp:$this_sdata/JOB/cmvn.scp scp:$this_sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"
+      feats_one[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/1/utt2spk scp:$this_sdata/1/cmvn.scp scp:$this_sdata/1/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
+    lda) feats[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/JOB/utt2spk scp:$this_sdata/JOB/cmvn.scp scp:$this_sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+      feats_one[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/1/utt2spk scp:$this_sdata/1/cmvn.scp scp:$this_sdata/1/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+      cp $alidir/final.mat $dir
+      cp $alidir/full.mat $dir 2>/dev/null
+      ;;
+    *) echo "$0: invalid feature type $feat_type" && exit 1;
+  esac
+
+  if $use_fmllr; then
+    if [ ! -f ${alidirs[$n]}/trans.1 ]; then
+      echo "$0: Could not find fMLLR transforms in ${alidirs[$n]}"
+      exit 1
+    fi
+
+    echo "$0: Using transforms from ${alidirs[$n]}"
+    feats[$n]="${feats[$n]} transform-feats --utt2spk=ark:$this_sdata/JOB/utt2spk ark,s,cs:${alidirs[$n]}/trans.JOB ark:- ark:- |"
+    feats_one[$n]="${feats_one[$n]} transform-feats --utt2spk=ark:$this_sdata/1/utt2spk ark,s,cs:${alidirs[$n]}/trans.1 ark:- ark:- |"
+  fi
+
+  # Do subsampling of feats, if needed
+  if [ $frame_subsampling_factor -gt 1 ]; then
+    feats[$n]="${feats[$n]} subsample-feats --n=$frame_subsampling_factor ark:- ark:- |"
+    feats_one[$n]="${feats_one[$n]} subsample-feats --n=$frame_subsampling_factor ark:- ark:- |"
+  fi
+done
+
+if [ $stage -le -5 ]; then
+  echo "$0: Initializing monophone model (for alignment conversion, in case topology changed)"
+
+  [ ! -f $lang/phones/sets.int ] && exit 1;
+  shared_phones_opt="--shared-phones=$lang/phones/sets.int"
+  # get feature dimension
+  example_feats="`echo ${feats[0]} | sed s/JOB/1/g`";
+  if ! feat_dim=$(feat-to-dim "$example_feats" - 2>/dev/null) || [ -z $feat_dim ]; then
+    feat-to-dim "$example_feats" - # to see the error message.
+    echo "error getting feature dimension"
+    exit 1;
+  fi
+
+  for n in `seq 0 $[num_sys-1]`; do
+    copy-feats "${feats_one[$n]}" ark:-
+  done | copy-feats ark:- ark:$dir/tmp.ark
+
+  $cmd $dir/log/init_mono.log \
+    gmm-init-mono $shared_phones_opt \
+      "--train-feats=ark:subset-feats --n=10 ark:$dir/tmp.ark ark:- |" $lang/topo $feat_dim \
+      $dir/mono.mdl $dir/mono.tree || exit 1
+fi
+
+
+if [ $stage -le -4 ]; then
+  # Get tree stats.
+ + for n in `seq 0 $[num_sys-1]`; do + echo "$0: Accumulating tree stats" + this_data=${datadirs[$n]} + this_alidir=${alidirs[$n]} + this_nj=$(cat $this_alidir/num_jobs) || exit 1 + this_frame_subsampling_factor=1 + if [ -f $this_alidir/frame_subsampling_factor ]; then + this_frame_subsampling_factor=$(cat $this_alidir/frame_subsampling_factor) + fi + + if (( $frame_subsampling_factor % $this_frame_subsampling_factor != 0 )); then + echo "$0: frame-subsampling-factor=$frame_subsampling_factor is not " + echo "divisible by $this_frame_subsampling_factor (that of $this_alidir)" + exit 1 + fi + + this_frame_subsampling_factor=$((frame_subsampling_factor / this_frame_subsampling_factor)) + $cmd JOB=1:$this_nj $dir/log/acc_tree.$n.JOB.log \ + convert-ali --frame-subsampling-factor=$this_frame_subsampling_factor \ + $this_alidir/final.mdl $dir/mono.mdl $dir/mono.tree "ark:gunzip -c $this_alidir/ali.JOB.gz|" ark:- \| \ + acc-tree-stats $context_opts $tree_stats_opts --ci-phones=$ciphonelist $dir/mono.mdl \ + "${feats[$n]}" ark:- $dir/$n.JOB.treeacc || exit 1; + [ "`ls $dir/$n.*.treeacc | wc -w`" -ne "$this_nj" ] && echo "$0: Wrong #tree-accs for data $n $this_data" && exit 1; + done + + $cmd $dir/log/sum_tree_acc.log \ + sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1; + rm $dir/*.treeacc +fi + +if [ $stage -le -3 ] && $train_tree; then + echo "$0: Getting questions for tree clustering." + # preparing questions, roots file... + $cmd $dir/log/questions.log \ + cluster-phones $cluster_phones_opts $context_opts $dir/treeacc \ + $lang/phones/sets.int $dir/questions.int || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + $cmd $dir/log/compile_questions.log \ + compile-questions \ + $context_opts $lang/topo $dir/questions.int $dir/questions.qst || exit 1; + + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; +fi + +if [ $stage -le -2 ]; then + echo "$0: Initializing the model" + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1; + grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning."; + rm $dir/treeacc +fi + +if [ $stage -le -1 ]; then + # Convert the alignments to the new tree. Note: we likely will not use these + # converted alignments in the CTC system directly, but they could be useful + # for other purposes. 
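+  # Each alignment source is converted separately: the conversion uses a
+  # per-source subsampling factor equal to the overall frame-subsampling
+  # factor divided by the factor the source alignments were produced with,
+  # and writes ark,scp outputs so the alignments can later be merged and
+  # re-split to match the combined data's job split below.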
+ + for n in `seq 0 $[num_sys-1]`; do + this_alidir=${alidirs[$n]} + this_nj=$(cat $this_alidir/num_jobs) || exit 1 + + this_frame_subsampling_factor=1 + if [ -f $this_alidir/frame_subsampling_factor ]; then + this_frame_subsampling_factor=$(cat $this_alidir/frame_subsampling_factor) + fi + + if (( $frame_subsampling_factor % $this_frame_subsampling_factor != 0 )); then + echo "$0: frame-subsampling-factor=$frame_subsampling_factor is not " + echo "divisible by $this_frame_subsampling_factor (hat of $this_alidir)" + exit 1 + fi + + echo "$0: frame-subsampling-factor for $this_alidir is $this_frame_subsampling_factor" + + this_frame_subsampling_factor=$((frame_subsampling_factor / this_frame_subsampling_factor)) + echo "$0: Converting alignments from $this_alidir to use current tree" + $cmd JOB=1:$this_nj $dir/log/convert.$n.JOB.log \ + convert-ali --repeat-frames=$repeat_frames \ + --frame-subsampling-factor=$this_frame_subsampling_factor \ + $this_alidir/final.mdl $dir/1.mdl $dir/tree "ark:gunzip -c $this_alidir/ali.JOB.gz |" \ + ark,scp:$dir/ali.$n.JOB.ark,$dir/ali.$n.JOB.scp || exit 1 + + for i in `seq $this_nj`; do + cat $dir/ali.$n.$i.scp + done > $dir/ali.$n.scp || exit 1 + done + + for n in `seq 0 $[num_sys-1]`; do + cat $dir/ali.$n.scp + done | sort -k1,1 > $dir/ali.scp || exit 1 + + utils/split_data.sh $data $nj + $cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 +fi + +cp $dir/1.mdl $dir/final.mdl + +echo $0: Done building tree diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index cec6f8e166f..f3202778daa 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -63,6 +63,17 @@ online_ivector_dir= # can be used if we are including speaker information as iV cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, # it doesn't make sense to use different options than were used as input to the # LDA transform). This is used to turn off CMVN in the online-nnet experiments. +lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be + # used (with this scale) in generating supervisions +egs_weight=1.0 # The weight which determines how much each training example + # contributes to gradients while training (can be used + # to down/up-weight a dataset) +lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, + # before being used to get supervisions. +acwt=0.1 # For pruning +phone_insertion_penalty= +deriv_weights_scp= +generate_egs_scp=false echo "$0 $@" # Print the command line for logging @@ -184,6 +195,8 @@ if [ -f $dir/trans.scp ]; then train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" fi +tree-info $chaindir/tree | grep num-pdfs | awk '{print $2}' > $dir/info/num_pdfs || exit 1 + if [ ! -z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim @@ -260,8 +273,10 @@ fi if [ $stage -le 2 ]; then echo "$0: copying training lattices" + [ ! 
-z $lattice_prune_beam ] && \ + prune_cmd="ark:- | lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:-" $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ - lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; + lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" $prune_cmd ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp fi @@ -271,6 +286,7 @@ egs_opts="--left-context=$left_context --right-context=$right_context --num-fram [ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" [ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" +[ ! -z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp" chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" [ ! -z $right_tolerance ] && \ @@ -279,6 +295,20 @@ chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$ali [ ! -z $left_tolerance ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" +normalization_scale=1.0 +if [ ! -z "$lattice_lm_scale" ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" + normalization_scale=$(perl -e " + if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { + print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; + exit(1); + } + print (1.0 - $lattice_lm_scale);") +fi + +[ ! -z $phone_insertion_penalty ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --phone-ins-penalty=$phone_insertion_penalty" + echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial @@ -299,7 +329,7 @@ if [ $stage -le 3 ]; then chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=$srand \ - $egs_opts $chaindir/normalization.fst \ + $egs_opts --normalization-scale=$normalization_scale $chaindir/normalization.fst \ "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ @@ -307,27 +337,40 @@ if [ $stage -le 3 ]; then chain-get-supervision $chain_supervision_all_opts \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=$srand \ - $egs_opts $chaindir/normalization.fst \ + $egs_opts --normalization-scale=$normalization_scale $chaindir/normalization.fst \ "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || touch $dir/.error & wait; [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 echo "... Getting subsets of validation examples for diagnostics and combination." 
+ if $generate_egs_scp; then + valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp" + train_diagnostic_output="ark,scp:$dir/train_diagnostic.cegs,$dir/train_diagnostic.scp" + else + valid_diagnostic_output="ark:$dir/valid_diagnostic.cegs" + train_diagnostic_output="ark:$dir/train_diagnostic.cegs" + fi $cmd $dir/log/create_valid_subset_combine.log \ nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ ark:$dir/valid_combine.cegs || touch $dir/.error & $cmd $dir/log/create_valid_subset_diagnostic.log \ nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ - ark:$dir/valid_diagnostic.cegs || touch $dir/.error & + $valid_diagnostic_output || touch $dir/.error & $cmd $dir/log/create_train_subset_combine.log \ nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ ark:$dir/train_combine.cegs || touch $dir/.error & $cmd $dir/log/create_train_subset_diagnostic.log \ nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ - ark:$dir/train_diagnostic.cegs || touch $dir/.error & + $train_diagnostic_output || touch $dir/.error & wait sleep 5 # wait for file system to sync. - cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs + if $generate_egs_scp; then + cat $dir/valid_combine.cegs $dir/train_combine.cegs | \ + nnet3-chain-copy-egs ark:- ark,scp:$dir/combine.cegs,$dir/combine.scp + rm $dir/{train,valid}_combine.scp + else + cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs + fi for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do [ ! -s $f ] && echo "No examples in file $f" && exit 1; @@ -357,6 +400,7 @@ if [ $stage -le 4 ]; then utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp \| \ lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ chain-get-supervision $chain_supervision_all_opts \ + --weight=$egs_weight \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ --num-frames-overlap=$frames_overlap_per_eg \ @@ -376,16 +420,34 @@ if [ $stage -le 5 ]; then done if [ $archives_multiple == 1 ]; then # normal case. + if $generate_egs_scp; then + output_archive="ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp" + else + output_archive="ark:$dir/cegs.JOB.ark" + fi $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ - nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:$dir/cegs.JOB.ark || exit 1; + nnet3-chain-normalize-egs --normalization-scale=$normalization_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- $output_archive || exit 1; + + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + rm -rf $dir/cegs.scp + for j in $(seq $num_archives_intermediate); do + cat $dir/cegs.$j.scp || exit 1; + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.scp; do rm $f; done + fi else # we need to shuffle the 'intermediate archives' and then split into the # final archives. we create soft links to manage this splitting, because # otherwise managing the output names is quite difficult (and we don't want # to submit separate queue jobs for each intermediate archive, because then # the --max-jobs-run option is hard to enforce). 
-  output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)"
+  if $generate_egs_scp; then
+    output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$dir/cegs.JOB.$y.ark,$dir/cegs.JOB.$y.scp; done)"
+  else
+    output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)"
+  fi
  for x in $(seq $num_archives_intermediate); do
    for y in $(seq $archives_multiple); do
      archive_index=$[($x-1)*$archives_multiple+$y]
@@ -394,9 +456,20 @@ if [ $stage -le 5 ]; then
    done
  done
  $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
-    nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \
+    nnet3-chain-normalize-egs --normalization-scale=$normalization_scale $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \
    nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \
    nnet3-chain-copy-egs ark:- $output_archives || exit 1;
+
+  if $generate_egs_scp; then
+    #concatenate cegs.JOB.scp in single cegs.scp
+    rm -rf $dir/cegs.scp
+    for j in $(seq $num_archives_intermediate); do
+      for y in $(seq $archives_multiple); do
+        cat $dir/cegs.$j.$y.scp || exit 1;
+      done
+    done > $dir/cegs.scp || exit 1;
+    for f in $dir/cegs.*.*.scp; do rm $f; done
+  fi
  fi
fi
diff --git a/egs/wsj/s5/steps/nnet3/chain/make_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_den_fst.sh
new file mode 100755
index 00000000000..3467e887cd5
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/chain/make_den_fst.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+# Copyright 2014-17 Vimal Manohar
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script creates the denominator FST (den.fst) and normalization.fst for
+# chain training. It additionally copies the transition model and tree from the
+# first alignment directory to the chain directory.
+# This script can accept multiple sources of alignments that can be
+# weighted to estimate the phone LM.
+
+set -o pipefail
+
+# begin configuration section.
+cmd=run.pl
+stage=-10
+weights=
+lm_opts=   # options passed to chain-est-phone-lm
+#end configuration section.
+
+help_message="Usage: "$(basename $0)" [options] <ali-dir1> [<ali-dir2> ...] <out-dir>
+ E.g. "$(basename $0)" exp/tri1_ali exp/tri2_ali exp/chain/tdnn_1a_sp
+Options:
+  --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes.
+";
+
+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# -lt 2 ]; then
+  printf "$help_message\n";
+  exit 1;
+fi
+
+dir=${@: -1}  # last argument to the script
+ali_dirs=( $@ )  # read the remaining arguments into an array
+unset ali_dirs[${#ali_dirs[@]}-1]  # 'pop' the last argument which is odir
+num_sys=${#ali_dirs[@]}  # number of systems to combine
+
+mkdir -p $dir/log
+
+ali_dir=`echo ${ali_dirs[0]} | cut -d: -f1`
+
+for f in $ali_dir/ali.1.gz $ali_dir/final.mdl $ali_dir/tree; do
+  if [ !
-f $f ]; then + echo "$0: Could not find file $f" + exit 1 + fi +done + +cp $ali_dir/tree $dir/ || exit 1 + +for n in `seq 0 $[num_sys-1]`; do + adir=${ali_dirs[$n]} + alignments+=("ark:gunzip -c $adir/ali.*.gz | ali-to-phones $adir/final.mdl ark:- ark:- |") +done + +if [ $stage -le 1 ]; then + $cmd $dir/log/make_phone_lm.log \ + chain-est-phone-lm $lm_opts --scales="$weights" \ + "${alignments[@]}" $dir/phone_lm.fst || exit 1 +fi + +if [ $stage -le 2 ]; then + copy-transition-model $ali_dir/final.mdl $dir/0.trans_mdl +fi + +if [ $stage -le 3 ]; then + $cmd $dir/log/make_den_fst.log \ + chain-make-den-fst $dir/tree $dir/0.trans_mdl \ + $dir/phone_lm.fst \ + $dir/den.fst $dir/normalization.fst || exit 1 +fi + +exit 0 diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 6f9452c457c..c611d10edb1 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -377,13 +377,23 @@ def train(args, run_opts): logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir)) common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) + if not os.path.exists('{0}/valid_diagnostic.cegs'.format(egs_dir)): + if (not os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir))): + raise Exception('neither {0}/valid_diagnostic.cegs nor ' + '{0}/valid_diagnostic.scp exist.' + 'This script expects one of them.'.format(egs_dir)) + use_multitask_egs = True + else: + use_multitask_egs = False + if (args.stage <= -2) and os.path.exists(args.dir+"/configs/init.config"): logger.info('Computing the preconditioning matrix for input features') chain_lib.compute_preconditioning_matrix( args.dir, egs_dir, num_archives, run_opts, max_lda_jobs=args.max_lda_jobs, - rand_prune=args.rand_prune) + rand_prune=args.rand_prune, + use_multitask_egs=use_multitask_egs) if (args.stage <= -1): logger.info("Preparing the initial acoustic model.") @@ -477,7 +487,8 @@ def train(args, run_opts): frame_subsampling_factor=args.frame_subsampling_factor, run_opts=run_opts, backstitch_training_scale=args.backstitch_training_scale, - backstitch_training_interval=args.backstitch_training_interval) + backstitch_training_interval=args.backstitch_training_interval, + use_multitask_egs=use_multitask_egs) if args.cleanup: # do a clean up everythin but the last 2 models, under certain @@ -512,7 +523,8 @@ def train(args, run_opts): l2_regularize=args.l2_regularize, xent_regularize=args.xent_regularize, run_opts=run_opts, - sum_to_one_penalty=args.combine_sum_to_one_penalty) + sum_to_one_penalty=args.combine_sum_to_one_penalty, + use_multitask_egs=use_multitask_egs) else: logger.info("Copying the last-numbered model to final.mdl") common_lib.force_symlink("{0}.mdl".format(num_iters), diff --git a/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py index 6372ba25e5e..860c444e342 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py +++ b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py @@ -98,6 +98,13 @@ def get_args(): parser.add_argument("--samples-per-iter", type=int, default=40000, help="The target number of egs in each archive of egs, " "(prior to merging egs). 
") + parser.add_argument("--frames-per-iter", type=int, default=400000, + help="The target number of frames in each archive of " + "egs") + parser.add_argument("--frames-per-eg-list", type=str, default=None, + action=common_lib.NullstrToNoneAction, + help="Number of frames per eg for each input language " + "as a comma separated list") parser.add_argument("--num-jobs", type=int, default=20, help="This can be used for better randomization in distributing " "examples for different languages across egs.*.scp files, " @@ -107,7 +114,7 @@ def get_args(): help="If true, egs.ranges.*.txt are generated " "randomly w.r.t distribution of remaining examples in " "each language, otherwise it is generated sequentially.", - default=True, choices = ["false", "true"]) + default=True, choices=["false", "true"]) parser.add_argument("--max-archives", type=int, default=1000, help="max number of archives used to generate egs.*.scp") parser.add_argument("--seed", type=int, default=1, @@ -129,7 +136,7 @@ def get_args(): # now the positional arguments parser.add_argument("egs_scp_lists", nargs='+', help="list of egs.scp files per input language." - "e.g. exp/lang1/egs/egs.scp exp/lang2/egs/egs.scp") + "e.g. exp/lang1/egs/egs.scp exp/lang2/egs/egs.scp") parser.add_argument("egs_dir", help="Name of egs directory e.g. exp/tdnn_multilingual_sp/egs") @@ -153,7 +160,7 @@ def select_random_lang(lang_len, tot_egs, random_selection): count = 0 for l in range(len(lang_len)): if random_selection: - if rand_int <= (count + lang_len[l]): + if rand_int <= (count + lang_len[l]): return l else: count += lang_len[l] @@ -172,6 +179,10 @@ def process_multilingual_egs(args): scp_lists = args.egs_scp_lists num_langs = len(scp_lists) + frames_per_eg = ([1 for x in scp_lists] + if args.frames_per_eg_list is None + else [int(x) for x in args.frames_per_eg_list.split(',')]) + scp_files = [open(scp_lists[lang], 'r') for lang in range(num_langs)] lang2len = [0] * num_langs @@ -182,7 +193,7 @@ def process_multilingual_egs(args): # If weights are not provided, the weights are 1.0. if args.lang2weight is None: - lang2weight = [ 1.0 ] * num_langs + lang2weight = [1.0] * num_langs else: lang2weight = args.lang2weight.split(",") assert(len(lang2weight) == num_langs) @@ -195,10 +206,16 @@ def process_multilingual_egs(args): # Each element of all_egs (one per num_archive * num_jobs) is # an array of 3-tuples (lang-id, local-start-egs-line, num-egs) all_egs = [] - lang_len = lang2len[:] - # total num of egs in all languages - tot_num_egs = sum(lang2len[i] for i in range(len(lang2len))) - num_archives = max(1, min(args.max_archives, tot_num_egs / args.samples_per_iter)) + num_frames_in_lang = [frames_per_eg[i] * lang2len[i] + for i in range(num_langs)] + for lang in range(num_langs): + logger.info("Number of frames for language {0} " + "is {1}.".format(lang, num_frames_in_lang[lang])) + + # total num of frames in all languages + tot_num_frames = sum(num_frames_in_lang[i] for i in range(num_langs)) + num_archives = max(1, min(args.max_archives, + tot_num_frames / args.frames_per_iter)) num_arch_file = open("{0}/info/{1}num_archives".format( args.egs_dir, @@ -206,7 +223,7 @@ def process_multilingual_egs(args): "w") print("{0}".format(num_archives), file=num_arch_file) num_arch_file.close() - this_num_egs_per_archive = tot_num_egs / (num_archives * args.num_jobs) + this_num_frames_per_archive = tot_num_frames / (num_archives * args.num_jobs) logger.info("Generating {0}scp.. 
temporary files used to " "generate {0}.scp.".format(args.egs_prefix)) @@ -216,29 +233,36 @@ def process_multilingual_egs(args): "".format(args.egs_dir, args.egs_prefix, job + 1, archive_index + 1), "w") - this_egs = [] # this will be array of 2-tuples (lang-id start-frame num-frames) + # this will be array of 2-tuples (lang-id start-frame num-frames) + this_egs = [] num_egs = 0 - while num_egs <= this_num_egs_per_archive: - num_left_egs = sum(num_left_egs_per_lang for - num_left_egs_per_lang in lang_len) - if num_left_egs > 0: - lang_id = select_random_lang(lang_len, num_left_egs, rand_select) - start_egs = lang2len[lang_id] - lang_len[lang_id] + num_frames = 0 + while num_frames <= this_num_frames_per_archive: + num_frames_left = sum(num_frames_in_lang) + if num_frames_left > 0: + lang_id = select_random_lang(num_frames_in_lang, + num_frames_left, rand_select) + start_egs = ( + lang2len[lang_id] + - num_frames_in_lang[lang_id] / frames_per_eg[lang_id]) this_egs.append((lang_id, start_egs, args.minibatch_size)) for scpline in range(args.minibatch_size): scp_key = scp_files[lang_id].readline().splitlines()[0] print("{0} {1}".format(scp_key, lang_id), file=archfile) - lang_len[lang_id] = lang_len[lang_id] - args.minibatch_size - num_egs = num_egs + args.minibatch_size + num_frames_in_lang[lang_id] -= ( + args.minibatch_size * frames_per_eg[lang_id]) + num_egs += args.minibatch_size + num_frames += args.minibatch_size * frames_per_eg[lang_id] # If num of remaining egs in each lang is less than minibatch_size, # they are discarded. - if lang_len[lang_id] < args.minibatch_size: - lang_len[lang_id] = 0 - logger.info("Done processing data for language {0}".format( - lang_id)) + if (num_frames_in_lang[lang_id] + < args.minibatch_size * frames_per_eg[lang_id]): + num_frames_in_lang[lang_id] = 0 + logger.info("Done processing data for language {0}" + "".format(lang_id)) else: logger.info("Done processing data for all languages.") break @@ -315,4 +339,4 @@ def main(): if __name__ == "__main__": - main() + main() diff --git a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh index 3826dad11a9..dd8d9714905 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh @@ -19,13 +19,15 @@ minibatch_size=512 # it is the number of consecutive egs that we take from # access. This does not have to be the actual minibatch size; num_jobs=10 # helps for better randomness across languages # per archive. -samples_per_iter=400000 # this is the target number of egs in each archive of egs +frames_per_iter=400000 # this is the target number of egs in each archive of egs # (prior to merging egs). We probably should have called # it egs_per_iter. This is just a guideline; it will pick # a number that divides the number of samples in the # entire data. lang2weight= # array of weights one per input languge to scale example's output # w.r.t its input language during training. +allocate_opts= +egs_prefix=egs. stage=0 echo "$0 $@" # Print the command line for logging @@ -33,6 +35,12 @@ echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; +if [ $# -lt 3 ]; then + echo "Usage:$0 [opts] ... 
" + echo "Usage:$0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs" + exit 1; +fi + num_langs=$1 shift 1 @@ -47,7 +55,8 @@ if [ ${#args[@]} != $[$num_langs+1] ]; then exit 1; fi -required="egs.scp combine.scp train_diagnostic.scp valid_diagnostic.scp" +required="${egs_prefix}scp combine.scp train_diagnostic.scp valid_diagnostic.scp" +frames_per_eg_list= train_scp_list= train_diagnostic_scp_list= valid_diagnostic_scp_list= @@ -55,13 +64,14 @@ combine_scp_list= # read paramter from $egs_dir[0]/info and cmvn_opts # to write in multilingual egs_dir. -check_params="info/feat_dim info/ivector_dim info/left_context info/right_context info/frames_per_eg cmvn_opts" +check_params="info/feat_dim info/ivector_dim info/left_context info/right_context cmvn_opts" ivec_dim=`cat ${args[0]}/info/ivector_dim` if [ $ivec_dim -ne 0 ];then check_params="$check_params info/final.ie.id"; fi for param in $check_params; do - cat ${args[0]}/$param > $megs_dir/$param || exit 1; + cat ${args[0]}/$param > $megs_dir/$param || exit 1; done +cat ${args[0]}/cmvn_opts > $megs_dir/cmvn_opts || exit 1; # caution: the top-level nnet training for lang in $(seq 0 $[$num_langs-1]);do multi_egs_dir[$lang]=${args[$lang]} @@ -70,10 +80,19 @@ for lang in $(seq 0 $[$num_langs-1]);do echo "$0: no such file ${multi_egs_dir[$lang]}/$f." && exit 1; fi done - train_scp_list="$train_scp_list ${args[$lang]}/egs.scp" + train_scp_list="$train_scp_list ${args[$lang]}/${egs_prefix}scp" train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.scp" valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp" combine_scp_list="$combine_scp_list ${args[$lang]}/combine.scp" + + this_frames_per_eg=$(cat ${args[$lang]}/info/frames_per_eg) + + if [ $lang -eq 0 ]; then + frames_per_eg_list="$this_frames_per_eg" + echo $this_frames_per_eg > $megs_dir/info/frames_per_eg + else + frames_per_eg_list="$frames_per_eg_list,$this_frames_per_eg" + fi # check parameter dimension to be the same in all egs dirs for f in $check_params; do @@ -90,16 +109,18 @@ for lang in $(seq 0 $[$num_langs-1]);do done done +if [ ! -z "$lang2weight" ]; then + egs_opt="--lang2weight '$lang2weight'" +fi + if [ $stage -le 0 ]; then echo "$0: allocating multilingual examples for training." - if [ ! -z "$lang2weight" ]; then - egs_opt="--lang2weight '$lang2weight'" - fi - # Generate egs.*.scp for multilingual setup. + # Generate ${egs_prefix}*.scp for multilingual setup. $cmd $megs_dir/log/allocate_multilingual_examples_train.log \ steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ - --minibatch-size $minibatch_size \ - --samples-per-iter $samples_per_iter \ + ${allocate_opts} --minibatch-size $minibatch_size \ + --frames-per-iter $frames_per_iter --frames-per-eg-list $frames_per_eg_list \ + --egs-prefix "$egs_prefix" \ $train_scp_list $megs_dir || exit 1; fi @@ -107,20 +128,20 @@ if [ $stage -le 1 ]; then echo "$0: combine combine.scp examples from all langs in $megs_dir/combine.scp." # Generate combine.scp for multilingual setup. $cmd $megs_dir/log/allocate_multilingual_examples_combine.log \ - steps/nnet3/multilingual/allocate_multilingual_examples.py \ - --random-lang false \ - --max-archives 1 --num-jobs 1 \ - --minibatch-size $minibatch_size \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --random-lang false --max-archives 1 --num-jobs 1 \ + --frames-per-eg-list $frames_per_eg_list \ + ${allocate_opts} --minibatch-size $minibatch_size \ --egs-prefix "combine." 
\ $combine_scp_list $megs_dir || exit 1; echo "$0: combine train_diagnostic.scp examples from all langs in $megs_dir/train_diagnostic.scp." # Generate train_diagnostic.scp for multilingual setup. $cmd $megs_dir/log/allocate_multilingual_examples_train_diagnostic.log \ - steps/nnet3/multilingual/allocate_multilingual_examples.py \ - --random-lang false \ - --max-archives 1 --num-jobs 1 \ - --minibatch-size $minibatch_size \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --random-lang false --max-archives 1 --num-jobs 1 \ + --frames-per-eg-list $frames_per_eg_list \ + ${allocate_opts} --minibatch-size $minibatch_size \ --egs-prefix "train_diagnostic." \ $train_diagnostic_scp_list $megs_dir || exit 1; @@ -128,9 +149,10 @@ if [ $stage -le 1 ]; then echo "$0: combine valid_diagnostic.scp examples from all langs in $megs_dir/valid_diagnostic.scp." # Generate valid_diagnostic.scp for multilingual setup. $cmd $megs_dir/log/allocate_multilingual_examples_valid_diagnostic.log \ - steps/nnet3/multilingual/allocate_multilingual_examples.py \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ --random-lang false --max-archives 1 --num-jobs 1\ - --minibatch-size $minibatch_size \ + --frames-per-eg-list $frames_per_eg_list \ + ${allocate_opts} --minibatch-size $minibatch_size \ --egs-prefix "valid_diagnostic." \ $valid_diagnostic_scp_list $megs_dir || exit 1; @@ -140,6 +162,6 @@ for egs_type in combine train_diagnostic valid_diagnostic; do mv $megs_dir/${egs_type}.weight.1.ark $megs_dir/${egs_type}.weight.ark || exit 1; mv $megs_dir/${egs_type}.1.scp $megs_dir/${egs_type}.scp || exit 1; done -mv $megs_dir/info/egs.num_archives $megs_dir/info/num_archives || exit 1; -mv $megs_dir/info/egs.num_tasks $megs_dir/info/num_tasks || exit 1; +mv $megs_dir/info/${egs_prefix}num_archives $megs_dir/info/num_archives || exit 1; +mv $megs_dir/info/${egs_prefix}num_tasks $megs_dir/info/num_tasks || exit 1; echo "$0: Finished preparing multilingual training example." diff --git a/egs/wsj/s5/steps/subset_ali_dir.sh b/egs/wsj/s5/steps/subset_ali_dir.sh new file mode 100755 index 00000000000..c086ea39959 --- /dev/null +++ b/egs/wsj/s5/steps/subset_ali_dir.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0. + +cmd=run.pl + +. path.sh + +. 
utils/parse_options.sh + +if [ $# -ne 4 ]; then + cat < + e.g.: data/train data/train_sp exp/tri3_ali_sp exp/tri3_ali +EOF +fi + +subset_data=$1 +data=$2 +ali_dir=$3 +dir=$4 + +nj=$(cat $ali_dir/num_jobs) || exit 1 +utils/split_data.sh $data $nj + +mkdir -p $dir +cp $ali_dir/{final.mdl,*.mat,*_opts,tree} $dir/ || true +cp -r $ali_dir/phones $dir 2>/dev/null || true + +$cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $ali_dir/ali.JOB.gz |" \ + ark,scp:$dir/ali_tmp.JOB.ark,$dir/ali_tmp.JOB.scp || exit 1 + +for n in `seq $nj`; do + cat $dir/ali_tmp.$n.scp +done > $dir/ali_tmp.scp + +num_spk=$(cat $subset_data/spk2utt | wc -l) +if [ $num_spk -lt $nj ]; then + nj=$num_spk +fi + +utils/split_data.sh $subset_data $nj +$cmd JOB=1:$nj $dir/log/filter_alignments.JOB.log \ + copy-int-vector \ + "scp:utils/filter_scp.pl $subset_data/split${nj}/JOB/utt2spk $dir/ali_tmp.scp |" \ + "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 + +echo $nj > $dir/num_jobs + +rm $dir/ali_tmp.*.{ark,scp} $dir/ali_tmp.scp + +exit 0 diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index b5597b15667..fb0f0284df7 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -19,6 +19,7 @@ #include "chain/chain-supervision.h" #include "lat/lattice-functions.h" +#include "lat/push-lattice.h" #include "util/text-utils.h" #include "hmm/hmm-utils.h" #include @@ -142,9 +143,9 @@ bool ProtoSupervision::operator == (const ProtoSupervision &other) const { fst::Equal(fst, other.fst)); } -bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, - const CompactLattice &lat, - ProtoSupervision *proto_supervision) { +bool PhoneLatticeToProtoSupervisionInternal(const SupervisionOptions &opts, + const CompactLattice &lat, + ProtoSupervision *proto_supervision) { opts.Check(); if (lat.NumStates() == 0) { KALDI_WARN << "Empty lattice provided"; @@ -176,9 +177,11 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, return false; } proto_supervision->fst.AddArc(state, - fst::StdArc(phone, phone, - fst::TropicalWeight::One(), - lat_arc.nextstate)); + fst::StdArc(phone, phone, + fst::TropicalWeight( + lat_arc.weight.Weight().Value1() + * opts.lm_scale + opts.phone_ins_penalty), + lat_arc.nextstate)); int32 t_begin = std::max(0, (state_time - opts.left_tolerance)), t_end = std::min(num_frames, (next_state_time + opts.right_tolerance)), @@ -189,7 +192,8 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, proto_supervision->allowed_phones[t_subsampled].push_back(phone); } if (lat.Final(state) != CompactLatticeWeight::Zero()) { - proto_supervision->fst.SetFinal(state, fst::TropicalWeight::One()); + proto_supervision->fst.SetFinal(state, fst::TropicalWeight( + lat.Final(state).Weight().Value1() * opts.lm_scale)); if (state_times[state] != num_frames) { KALDI_WARN << "Time of final state " << state << " in lattice is " << "not equal to number of frames " << num_frames @@ -207,6 +211,16 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, return true; } +bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, + const CompactLattice &lat, + ProtoSupervision *proto_supervision) { + if (!PhoneLatticeToProtoSupervisionInternal(opts, lat, proto_supervision)) + return false; + if (opts.lm_scale != 0.0) + fst::Push(&(proto_supervision->fst), + fst::REWEIGHT_TO_INITIAL, fst::kDelta, true); + return true; +} bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) { // the following call will 
do the range-check on 'ilabel'. diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index a94f68ade90..ce755f0cb63 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -50,10 +50,16 @@ struct SupervisionOptions { int32 left_tolerance; int32 right_tolerance; int32 frame_subsampling_factor; + BaseFloat weight; + BaseFloat lm_scale; + BaseFloat phone_ins_penalty; SupervisionOptions(): left_tolerance(5), right_tolerance(5), - frame_subsampling_factor(1) { } + frame_subsampling_factor(1), + weight(1.0), + lm_scale(0.0), + phone_ins_penalty(0.0) { } void Register(OptionsItf *opts) { opts->Register("left-tolerance", &left_tolerance, "Left tolerance for " @@ -65,6 +71,13 @@ struct SupervisionOptions { "frame-rate of the original alignment. Applied after " "left-tolerance and right-tolerance are applied (so they are " "in terms of the original num-frames."); + opts->Register("weight", &weight, + "Use this to set the supervision weight for training"); + opts->Register("lm-scale", &lm_scale, "The scale with which the graph/lm " + "weights from the phone lattice are included in the " + "supervision fst."); + opts->Register("phone-ins-penalty", &phone_ins_penalty, + "The penalty to penalize longer paths"); } void Check() const; }; diff --git a/src/chain/language-model.cc b/src/chain/language-model.cc index 41e06116ea8..d2bb073d764 100644 --- a/src/chain/language-model.cc +++ b/src/chain/language-model.cc @@ -26,7 +26,8 @@ namespace kaldi { namespace chain { -void LanguageModelEstimator::AddCounts(const std::vector &sentence) { +void LanguageModelEstimator::AddCounts(const std::vector &sentence, + int32 weight) { KALDI_ASSERT(opts_.ngram_order >= 2 && "--ngram-order must be >= 2"); KALDI_ASSERT(opts_.ngram_order >= opts_.no_prune_ngram_order); int32 order = opts_.ngram_order; @@ -36,23 +37,23 @@ void LanguageModelEstimator::AddCounts(const std::vector &sentence) { end = sentence.end(); for (; iter != end; ++iter) { KALDI_ASSERT(*iter != 0); - IncrementCount(history, *iter); + IncrementCount(history, *iter, weight); history.push_back(*iter); if (history.size() >= order) history.erase(history.begin()); } // Probability of end of sentence. This will end up getting ignored later, but // it still makes a difference for probability-normalization reasons. - IncrementCount(history, 0); + IncrementCount(history, 0, weight); } void LanguageModelEstimator::IncrementCount(const std::vector &history, - int32 next_phone) { + int32 next_phone, int32 weight) { int32 lm_state_index = FindOrCreateLmStateIndexForHistory(history); if (lm_states_[lm_state_index].tot_count == 0) { num_active_lm_states_++; } - lm_states_[lm_state_index].AddCount(next_phone, 1); + lm_states_[lm_state_index].AddCount(next_phone, weight); } void LanguageModelEstimator::SetParentCounts() { diff --git a/src/chain/language-model.h b/src/chain/language-model.h index b2c3f4cd746..123d5ab830f 100644 --- a/src/chain/language-model.h +++ b/src/chain/language-model.h @@ -91,7 +91,7 @@ class LanguageModelEstimator { // Adds counts for this sentence. Basically does: for each n-gram in the // sentence, count[n-gram] += 1. The only constraint on 'sentence' is that it // should contain no zeros. - void AddCounts(const std::vector &sentence); + void AddCounts(const std::vector &sentence, int32 weight); // Estimates the LM and outputs it as an FST. Note: there is // no concept here of backoff arcs. @@ -188,7 +188,7 @@ class LanguageModelEstimator { // adds the counts for this ngram (called from AddCounts()). 
inline void IncrementCount(const std::vector &history, - int32 next_phone); + int32 next_phone, int32 weight); // Computes whether backoff should be allowed for this lm_state. (the caller diff --git a/src/chainbin/chain-est-phone-lm.cc b/src/chainbin/chain-est-phone-lm.cc index f16b3f4f14b..db16cc4d51a 100644 --- a/src/chainbin/chain-est-phone-lm.cc +++ b/src/chainbin/chain-est-phone-lm.cc @@ -39,31 +39,52 @@ int main(int argc, char *argv[]) { " chain-est-phone-lm --leftmost-context-questions=dir/leftmost_questions.txt ark:- dir/phone_G.fst\n"; bool binary_write = true; + std::string scales_str; + LanguageModelOptions lm_opts; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("scales", &scales_str, "Comma-separated list of scales " + "for the different sources of phone sequences"); lm_opts.Register(&po); po.Read(argc, argv); - if (po.NumArgs() != 2) { + if (po.NumArgs() < 2) { po.PrintUsage(); exit(1); } - std::string phone_seqs_rspecifier = po.GetArg(1), - lm_fst_wxfilename = po.GetArg(2); - + int32 num_sources = po.NumArgs() - 1; + + std::string lm_fst_wxfilename = po.GetArg(po.NumArgs()); + + std::vector scales(num_sources, 1); + if (!scales_str.empty()) { + std::vector parts; + SplitStringToVector(scales_str, ":,", false, &parts); + if (parts.size() != num_sources) { + KALDI_ERR << "--scales must have exactly num-sources = " + << num_sources << " scales."; + } + for (size_t i = 0; i < parts.size(); i++) { + scales[i] = std::atoi(parts[i].c_str()); + } + } LanguageModelEstimator lm_estimator(lm_opts); - SequentialInt32VectorReader phones_reader(phone_seqs_rspecifier); - KALDI_LOG << "Reading phone sequences"; - for (; !phones_reader.Done(); phones_reader.Next()) { - const std::vector &phone_seq = phones_reader.Value(); - lm_estimator.AddCounts(phone_seq); + for (int32 n = 1; n <= num_sources; n++) { + std::string phone_seqs_rspecifier = po.GetArg(n); + SequentialInt32VectorReader phones_reader(phone_seqs_rspecifier); + KALDI_LOG << "Reading phone sequences"; + for (; !phones_reader.Done(); phones_reader.Next()) { + const std::vector &phone_seq = phones_reader.Value(); + lm_estimator.AddCounts(phone_seq, scales[n-1]); + } } + KALDI_LOG << "Estimating phone LM"; fst::StdVectorFst fst; lm_estimator.Estimate(&fst); diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 4f26e145ac5..c6f643bcae7 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -25,6 +25,40 @@ namespace kaldi { namespace nnet3 { +// rename name of NnetIo with old_name to new_name. +void RenameIoNames(const std::string &old_name, + const std::string &new_name, + NnetChainExample *eg_modified) { + // list of io-names in eg_modified. + std::vector orig_output_names; + int32 output_size = eg_modified->outputs.size(); + for (int32 output_ind = 0; output_ind < output_size; output_ind++) + orig_output_names.push_back(eg_modified->outputs[output_ind].name); + + // find the io in eg with name 'old_name'. + int32 rename_output_ind = + std::find(orig_output_names.begin(), orig_output_names.end(), old_name) - + orig_output_names.begin(); + + if (rename_output_ind >= output_size) + KALDI_ERR << "No io-node with name " << old_name + << "exists in eg."; + eg_modified->outputs[rename_output_ind].name = new_name; +} + +// ranames NnetIo name with name 'output' to new_output_name +// and scales the supervision for 'output' using weight. 
+void SetWeightAndRenameOutput(BaseFloat weight, + const std::string &new_output_name, + NnetChainExample *eg) { + // scale the supervision weight for egs + for (int32 i = 0; i < eg->outputs.size(); i++) + if (eg->outputs[i].name == "output") + if (weight != 0.0 && weight != 1.0) + eg->outputs[i].supervision.weight *= weight; + // rename output io name to 'new_output_name'. + RenameIoNames("output", new_output_name, eg); +} // returns an integer randomly drawn with expected value "expected_count" // (will be either floor(expected_count) or ceil(expected_count)). @@ -268,6 +302,8 @@ int main(int argc, char *argv[]) { int32 frame_subsampling_factor = -1; BaseFloat keep_proportion = 1.0; int32 left_context = -1, right_context = -1; + std::string eg_weight_rspecifier, eg_output_rspecifier; + ParseOptions po(usage); po.Register("random", &random, "If true, will write frames to output " "archives randomly, not round-robin."); @@ -285,6 +321,15 @@ int main(int argc, char *argv[]) { "feature left-context that we output."); po.Register("right-context", &right_context, "Can be used to truncate the " "feature right-context that we output."); + po.Register("weights", &eg_weight_rspecifier, + "Rspecifier indexed by the key of egs, providing a weight by " + "which we will scale the supervision matrix for that eg. " + "Used in multilingual training."); + po.Register("outputs", &eg_output_rspecifier, + "Rspecifier indexed by the key of egs, providing a string-valued " + "output name, e.g. 'output-0'. If provided, the NnetIo with " + "name 'output' will be renamed to the provided name. Used in " + "multilingual training."); po.Read(argc, argv); srand(srand_seed); @@ -297,6 +342,8 @@ int main(int argc, char *argv[]) { std::string examples_rspecifier = po.GetArg(1); SequentialNnetChainExampleReader example_reader(examples_rspecifier); + RandomAccessTokenReader output_reader(eg_output_rspecifier); + RandomAccessBaseFloatReader egs_weight_reader(eg_weight_rspecifier); int32 num_outputs = po.NumArgs() - 1; std::vector example_writers(num_outputs); @@ -307,8 +354,9 @@ int main(int argc, char *argv[]) { // not configurable for now. exclude_names.push_back(std::string("ivector")); - int64 num_read = 0, num_written = 0; - + int64 num_read = 0, num_written = 0, num_err = 0; + bool modify_eg_output = !(eg_output_rspecifier.empty() && + eg_weight_rspecifier.empty()); for (; !example_reader.Done(); example_reader.Next(), num_read++) { if (frame_subsampling_factor == -1) CalculateFrameSubsamplingFactor(example_reader.Value(), @@ -316,11 +364,41 @@ int main(int argc, char *argv[]) { // count is normally 1; could be 0, or possibly >1. int32 count = GetCount(keep_proportion); std::string key = example_reader.Key(); + NnetChainExample eg_modified_output; + const NnetChainExample &eg_orig = example_reader.Value(), + &eg = (modify_eg_output ? eg_modified_output : eg_orig); + // Note: in the normal case we just use 'eg'; eg_modified_output is + // for the case when the --outputs or --weights option is specified + // (only for multilingual training). + BaseFloat weight = 1.0; + std::string new_output_name; + if (modify_eg_output) { // This branch is only taken for multilingual training. 
+ eg_modified_output = eg_orig; + if (!eg_weight_rspecifier.empty()) { + if (!egs_weight_reader.HasKey(key)) { + KALDI_WARN << "No weight for example key " << key; + num_err++; + continue; + } + weight = egs_weight_reader.Value(key); + } + if (!eg_output_rspecifier.empty()) { + if (!output_reader.HasKey(key)) { + KALDI_WARN << "No new output-name for example key " << key; + num_err++; + continue; + } + new_output_name = output_reader.Value(key); + } + } if (frame_shift == 0 && left_context == -1 && right_context == -1) { - const NnetChainExample &eg = example_reader.Value(); for (int32 c = 0; c < count; c++) { int32 index = (random ? Rand() : num_written) % num_outputs; + if (modify_eg_output) // Only for multilingual training + SetWeightAndRenameOutput(weight, new_output_name, + &eg_modified_output); + example_writers[index]->Write(key, eg); num_written++; } @@ -336,6 +414,8 @@ int main(int argc, char *argv[]) { eg_out.Swap(&eg); for (int32 c = 0; c < count; c++) { int32 index = (random ? Rand() : num_written) % num_outputs; + if (modify_eg_output) + SetWeightAndRenameOutput(weight, new_output_name, &eg_out); example_writers[index]->Write(key, eg_out); num_written++; } diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index c8c251900ec..b644ba0aa01 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -43,6 +43,8 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, const MatrixBase *ivector_feats, int32 ivector_period, const chain::Supervision &supervision, + const VectorBase *deriv_weights, + int32 supervision_length_tolerance, const std::string &utt_id, bool compress, UtteranceSplitter *utt_splitter, @@ -51,7 +53,18 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, int32 num_input_frames = feats.NumRows(), num_output_frames = supervision.frames_per_sequence; - if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames)) + int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; + + if (deriv_weights && (std::abs(deriv_weights->Dim() - num_output_frames) + > supervision_length_tolerance)) { + KALDI_WARN << "For utterance " << utt_id + << ", mismatch between deriv-weights dim and num-output-frames" + << "; " << deriv_weights->Dim() << " vs " << num_output_frames; + return false; + } + + if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames, + supervision_length_tolerance)) return false; // LengthsMatch() will have printed a warning. std::vector chunks; @@ -65,8 +78,6 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, return false; } - int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; - chain::SupervisionSplitter sup_splitter(supervision); for (size_t c = 0; c < chunks.size(); c++) { @@ -92,19 +103,36 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, int32 first_frame = 0; // we shift the time-indexes of all these parts so // that the supervised part starts from frame 0. 
+ + NnetChainExample nnet_chain_eg; + nnet_chain_eg.outputs.resize(1); SubVector output_weights( &(chunk.output_weights[0]), static_cast(chunk.output_weights.size())); - NnetChainSupervision nnet_supervision("output", supervision_part, - output_weights, - first_frame, - frame_subsampling_factor); + if (!deriv_weights) { + NnetChainSupervision nnet_supervision("output", supervision_part, + output_weights, + first_frame, + frame_subsampling_factor); + nnet_chain_eg.outputs[0].Swap(&nnet_supervision); + } else { + Vector this_deriv_weights(num_frames_subsampled); + for (int32 i = 0; i < num_frames_subsampled; i++) { + int32 t = i + start_frame_subsampled; + if (t < deriv_weights->Dim()) + this_deriv_weights(i) = (*deriv_weights)(t); + } + KALDI_ASSERT(output_weights.Dim() == num_frames_subsampled); + this_deriv_weights.MulElements(output_weights); + NnetChainSupervision nnet_supervision("output", supervision_part, + this_deriv_weights, + first_frame, + frame_subsampling_factor); + nnet_chain_eg.outputs[0].Swap(&nnet_supervision); + } - NnetChainExample nnet_chain_eg; - nnet_chain_eg.outputs.resize(1); - nnet_chain_eg.outputs[0].Swap(&nnet_supervision); nnet_chain_eg.inputs.resize(ivector_feats != NULL ? 2 : 1); int32 tot_input_frames = chunk.left_context + chunk.num_frames + @@ -176,13 +204,15 @@ int main(int argc, char *argv[]) { "chain-get-supervision.\n"; bool compress = true; - int32 length_tolerance = 100, online_ivector_period = 1; + int32 length_tolerance = 100, online_ivector_period = 1, + supervision_length_tolerance = 1; ExampleGenerationConfig eg_config; // controls num-frames, // left/right-context, etc. + BaseFloat scale = 1.0; int32 srand_seed = 0; - std::string online_ivector_rspecifier; + std::string online_ivector_rspecifier, deriv_weights_rspecifier; ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs with input features " @@ -200,6 +230,16 @@ int main(int argc, char *argv[]) { po.Register("srand", &srand_seed, "Seed for random number generator "); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); + po.Register("supervision-length-tolerance", &supervision_length_tolerance, "Tolerance for " + "difference in num-frames-subsampled between supervision and deriv weights"); + po.Register("deriv-weights-rspecifier", &deriv_weights_rspecifier, + "Per-frame weights (only binary - 0 or 1) that specifies " + "whether a frame's gradient must be backpropagated or not. 
" + "Not specifying this is equivalent to specifying a vector of " + "all 1s."); + po.Register("normalization-scale", &scale, "Scale the weights from the " + "'normalization' FST before applying them to the examples."); + eg_config.Register(&po); po.Read(argc, argv); @@ -235,6 +275,14 @@ int main(int argc, char *argv[]) { if (!normalization_fst_rxfilename.empty()) { ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); KALDI_ASSERT(normalization_fst.NumStates() > 0); + + if (scale <= 0.0) { + KALDI_ERR << "Invalid scale on normalization FST; must be > 0.0"; + } + + if (scale != 1.0) { + ScaleFst(scale, &normalization_fst); + } } // Read as GeneralMatrix so we don't need to un-compress and re-compress @@ -245,6 +293,8 @@ int main(int argc, char *argv[]) { NnetChainExampleWriter example_writer(examples_wspecifier); RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); + RandomAccessBaseFloatVectorReader deriv_weights_reader( + deriv_weights_rspecifier); int32 num_err = 0; @@ -278,10 +328,24 @@ int main(int argc, char *argv[]) { num_err++; continue; } + + const Vector *deriv_weights = NULL; + if (!deriv_weights_rspecifier.empty()) { + if (!deriv_weights_reader.HasKey(key)) { + KALDI_WARN << "No deriv weights for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. + deriv_weights = &(deriv_weights_reader.Value(key)); + } + } if (!ProcessFile(normalization_fst, feats, online_ivector_feats, online_ivector_period, - supervision, key, compress, + supervision, deriv_weights, supervision_length_tolerance, + key, compress, &utt_splitter, &example_writer)) num_err++; } diff --git a/src/chainbin/nnet3-chain-normalize-egs.cc b/src/chainbin/nnet3-chain-normalize-egs.cc index 9d3f56f756a..139c08e7799 100644 --- a/src/chainbin/nnet3-chain-normalize-egs.cc +++ b/src/chainbin/nnet3-chain-normalize-egs.cc @@ -41,7 +41,11 @@ int main(int argc, char *argv[]) { "e.g.\n" "nnet3-chain-normalize-egs dir/normalization.fst ark:train_in.cegs ark:train_out.cegs\n"; + BaseFloat scale = 1.0; + ParseOptions po(usage); + po.Register("normalization-scale", &scale, "Scale the weights from the " + "'normalization' FST before applying them to the examples."); po.Read(argc, argv); @@ -57,6 +61,14 @@ int main(int argc, char *argv[]) { fst::StdVectorFst normalization_fst; ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); + if (scale <= 0.0) { + KALDI_ERR << "Invalid scale on normalization FST; must be > 0.0"; + } + + if (scale != 1.0) { + ScaleFst(scale, &normalization_fst); + } + SequentialNnetChainExampleReader example_reader(examples_rspecifier); NnetChainExampleWriter example_writer(examples_wspecifier); diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc index c93858fb06e..67de2b843bb 100644 --- a/src/nnet3/nnet-chain-combine.cc +++ b/src/nnet3/nnet-chain-combine.cc @@ -503,18 +503,18 @@ double NnetChainCombiner::ComputeObjfAndDerivFromNnet( prob_computer_->Reset(); std::vector::const_iterator iter = egs_.begin(), end = egs_.end(); - for (; iter != end; ++iter) + for (; iter != end; ++iter) { prob_computer_->Compute(*iter); - const ChainObjectiveInfo *objf_info = - prob_computer_->GetObjective("output"); - if (objf_info == NULL) - KALDI_ERR << "Error getting objective info (unsuitable egs?)"; - KALDI_ASSERT(objf_info->tot_weight > 0.0); + } + + std::pair p = prob_computer_->GetTotalObjective(); + BaseFloat tot_objf = p.first, tot_weight = p.second; + 
KALDI_ASSERT(tot_weight > 0.0); const Nnet &deriv = prob_computer_->GetDeriv(); VectorizeNnet(deriv, nnet_params_deriv); // we prefer to deal with normalized objective functions. - nnet_params_deriv->Scale(1.0 / objf_info->tot_weight); - return (objf_info->tot_like + objf_info->tot_l2_term) / objf_info->tot_weight; + nnet_params_deriv->Scale(1.0 / tot_weight); + return tot_objf / tot_weight; } diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 084b33347df..cd3d5894601 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -207,6 +207,26 @@ bool NnetChainComputeProb::PrintTotalStats() const { } +std::pair NnetChainComputeProb::GetTotalObjective() const { + unordered_map::const_iterator + iter, end; + iter = objf_info_.begin(); + end = objf_info_.end(); + BaseFloat tot_objf = 0.0, tot_weight = 0.0; + for (; iter != end; ++iter) { + const std::string &name = iter->first; + int32 node_index = nnet_.GetNodeIndex(name); + KALDI_ASSERT(node_index >= 0); + const ChainObjectiveInfo &info = iter->second; + BaseFloat like = (info.tot_like / info.tot_weight), + l2_term = (info.tot_l2_term / info.tot_weight); + tot_objf += like + l2_term; + tot_weight += info.tot_weight; + } + return std::make_pair(tot_objf, tot_weight); +} + + const ChainObjectiveInfo* NnetChainComputeProb::GetObjective( const std::string &output_name) const { unordered_map::const_iterator @@ -217,15 +237,29 @@ const ChainObjectiveInfo* NnetChainComputeProb::GetObjective( return NULL; } +static bool HasXentOutputs(const Nnet &nnet) { + const std::vector node_names = nnet.GetNodeNames(); + for (std::vector::const_iterator it = node_names.begin(); + it != node_names.end(); ++it) { + int32 node_index = nnet.GetNodeIndex(*it); + if (nnet.IsOutputNode(node_index) && + it->find("-xent") != std::string::npos) { + return true; + } + } + return false; +} + void RecomputeStats(const std::vector &egs, const chain::ChainTrainingOptions &chain_config_in, const fst::StdVectorFst &den_fst, Nnet *nnet) { KALDI_LOG << "Recomputing stats on nnet (affects batch-norm)"; chain::ChainTrainingOptions chain_config(chain_config_in); - if (nnet->GetNodeIndex("output-xent") != -1 && + if (HasXentOutputs(*nnet) && chain_config.xent_regularize == 0) { - // this forces it to compute the output for 'output-xent', which + // this forces it to compute the output for xent outputs, + // usually 'output-xent', which // means that we'll be computing batch-norm stats for any // components in that branch that have batch-norm. chain_config.xent_regularize = 0.1; diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h index 4125427c463..b2962cf87d3 100644 --- a/src/nnet3/nnet-chain-diagnostics.h +++ b/src/nnet3/nnet-chain-diagnostics.h @@ -83,6 +83,9 @@ class NnetChainComputeProb { // or NULL if there is no such info. const ChainObjectiveInfo *GetObjective(const std::string &output_name) const; + // returns the total objective summed over all the outputs + std::pair GetTotalObjective() const; + // if config.compute_deriv == true, returns a reference to the // computed derivative. Otherwise crashes. 
const Nnet &GetDeriv() const; diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index 351312fb952..d40df1a79f9 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -31,8 +31,8 @@ void NnetChainSupervision::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, name); WriteIndexVector(os, binary, indexes); supervision.Write(os, binary); - WriteToken(os, binary, ""); // for DerivWeights. Want to save space. - WriteVectorAsChar(os, binary, deriv_weights); + WriteToken(os, binary, ""); // for DerivWeights. Want to save space. + deriv_weights.Write(os, binary); WriteToken(os, binary, ""); } @@ -51,8 +51,11 @@ void NnetChainSupervision::Read(std::istream &is, bool binary) { ReadToken(is, binary, &token); // in the future this back-compatibility code can be reworked. if (token != "") { - KALDI_ASSERT(token == ""); - ReadVectorAsChar(is, binary, &deriv_weights); + KALDI_ASSERT(token == "" || token == ""); + if (token == "") + ReadVectorAsChar(is, binary, &deriv_weights); + else + deriv_weights.Read(is, binary); ExpectToken(is, binary, ""); } CheckDim(); @@ -82,8 +85,7 @@ void NnetChainSupervision::CheckDim() const { } if (deriv_weights.Dim() != 0) { KALDI_ASSERT(deriv_weights.Dim() == indexes.size()); - KALDI_ASSERT(deriv_weights.Min() >= 0.0 && - deriv_weights.Max() <= 1.0); + KALDI_ASSERT(deriv_weights.Min() >= 0.0); } } diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 65df0c891c1..5a0eebd9e9a 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -1265,6 +1265,21 @@ void ExampleMerger::Finish() { stats_.PrintStats(); } +void ScaleFst(BaseFloat scale, fst::StdVectorFst *fst) { + typedef fst::StdArc Arc; + typedef Arc::StateId StateId; + typedef Arc::Weight Weight; + + for (StateId s = 0; s < fst->NumStates(); s++) { + for (fst::MutableArcIterator aiter(fst, s); + !aiter.Done(); aiter.Next()) { + Arc arc = aiter.Value(); + Weight weight(arc.weight.Value() * scale); + arc.weight = weight; + aiter.SetValue(arc); + } + } +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 02620df7485..3dcd90eb980 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -516,7 +516,7 @@ class ExampleMerger { MapType eg_to_egs_; }; - +void ScaleFst(BaseFloat scale, fst::StdVectorFst *fst); } // namespace nnet3 } // namespace kaldi
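
The new --normalization-scale options in nnet3-chain-get-egs and nnet3-chain-normalize-egs are implemented via the ScaleFst() helper declared above in nnet-example-utils.h. A minimal sketch of its effect, assuming a Kaldi build environment (the toy FST below is purely illustrative and not part of the patch):

#include "nnet3/nnet-example-utils.h"

int main() {
  using fst::StdArc;
  fst::StdVectorFst normalization_fst;
  StdArc::StateId s0 = normalization_fst.AddState(),
                  s1 = normalization_fst.AddState();
  normalization_fst.SetStart(s0);
  // One arc with cost 2.0 and a final cost of 0.5.
  normalization_fst.AddArc(s0, StdArc(1, 1, StdArc::Weight(2.0), s1));
  normalization_fst.SetFinal(s1, StdArc::Weight(0.5));

  // ScaleFst() multiplies every arc cost by the given scale; as written in
  // this patch it leaves final costs untouched.  After this call the arc
  // s0->s1 has cost 1.0 and the final cost at s1 is still 0.5.
  kaldi::nnet3::ScaleFst(0.5, &normalization_fst);
  return 0;
}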
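
With the lm-scale and phone-ins-penalty options added to SupervisionOptions above, the cost that PhoneLatticeToProtoSupervisionInternal() places on each proto-supervision arc and final state can be summarised by this simplified sketch (it ignores tolerances and frame subsampling; graph_cost stands for the lattice graph/LM cost, Value1()):

#include "chain/chain-supervision.h"

// Cost placed on a proto-supervision arc, per the change in chain-supervision.cc.
kaldi::BaseFloat ArcCost(kaldi::BaseFloat graph_cost,
                         const kaldi::chain::SupervisionOptions &opts) {
  return graph_cost * opts.lm_scale + opts.phone_ins_penalty;
}

// Cost placed on a final state of the proto-supervision FST.
kaldi::BaseFloat FinalCost(kaldi::BaseFloat graph_cost,
                           const kaldi::chain::SupervisionOptions &opts) {
  return graph_cost * opts.lm_scale;
}

// PhoneLatticeToProtoSupervision() then pushes weights to the initial state
// (fst::Push with REWEIGHT_TO_INITIAL) whenever lm_scale != 0.0, which
// redistributes the cost along paths without changing path totals.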