From f21979879d9b4d9f79b384270e8c7e780b76aebd Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Fri, 5 Jan 2018 00:59:47 +0330 Subject: [PATCH] [egs] Add OCR/Handwriting Recognition examples (#1984) * OCR: Add IAM corpus with unk decoding support (#3) * Add a new English OCR database 'UW3' * Some minor fixes re IAM corpus * Fix an issue in IAM chain recipes + add a new recipe (#6) * Some fixes based on the pull request review * Various fixes + cleaning on IAM * Fix LM estimation and add extended dictionary + other minor fixes * Add README for IAM * Add output filter for scoring * Fix a bug RE switch to pyhton3 * Add updated results + minor fixes * Remove unk decoding -- gives almost no gain * Add UW3 OCR database * Fix cmd.sh in IAM + fix usages of train/decode_cmd in chain recipes * Various minor fixes on UW3 * Rename iam/s5 to iam/v1 * Add README file for UW3 * Various cosmetic fixes on UW3 scripts * Minor fixes in IAM --- egs/iam/README.txt | 4 + egs/iam/v1/cmd.sh | 13 + egs/iam/v1/image | 1 + egs/iam/v1/local/chain/compare_wer.sh | 59 +++++ egs/iam/v1/local/chain/run_cnn_1a.sh | 235 +++++++++++++++++ egs/iam/v1/local/chain/run_cnn_chainali_1a.sh | 244 +++++++++++++++++ egs/iam/v1/local/chain/run_cnn_chainali_1b.sh | 245 ++++++++++++++++++ egs/iam/v1/local/make_features.py | 87 +++++++ egs/iam/v1/local/prepare_data.sh | 149 +++++++++++ egs/iam/v1/local/prepare_dict.sh | 49 ++++ egs/iam/v1/local/process_data.py | 82 ++++++ egs/iam/v1/local/score.sh | 5 + egs/iam/v1/local/train_lm.sh | 139 ++++++++++ egs/iam/v1/local/wer_output_filter | 27 ++ egs/iam/v1/path.sh | 6 + egs/iam/v1/run.sh | 122 +++++++++ egs/iam/v1/steps | 1 + egs/iam/v1/utils | 1 + egs/uw3/README.txt | 4 + egs/uw3/v1/cmd.sh | 13 + egs/uw3/v1/image | 1 + egs/uw3/v1/local/chain/compare_wer.sh | 72 +++++ egs/uw3/v1/local/chain/run_cnn_1a.sh | 234 +++++++++++++++++ egs/uw3/v1/local/make_features.py | 97 +++++++ egs/uw3/v1/local/prepare_data.sh | 40 +++ egs/uw3/v1/local/prepare_dict.sh | 29 +++ 
egs/uw3/v1/local/process_data.py | 61 +++++ egs/uw3/v1/local/score.sh | 156 +++++++++++ egs/uw3/v1/local/train_lm.sh | 102 ++++++++ .../v1/local/unk_arc_post_to_transcription.py | 86 ++++++ egs/uw3/v1/path.sh | 6 + egs/uw3/v1/run.sh | 106 ++++++++ egs/uw3/v1/steps | 1 + egs/uw3/v1/utils | 1 + 34 files changed, 2478 insertions(+) create mode 100644 egs/iam/README.txt create mode 100644 egs/iam/v1/cmd.sh create mode 120000 egs/iam/v1/image create mode 100755 egs/iam/v1/local/chain/compare_wer.sh create mode 100755 egs/iam/v1/local/chain/run_cnn_1a.sh create mode 100755 egs/iam/v1/local/chain/run_cnn_chainali_1a.sh create mode 100755 egs/iam/v1/local/chain/run_cnn_chainali_1b.sh create mode 100755 egs/iam/v1/local/make_features.py create mode 100755 egs/iam/v1/local/prepare_data.sh create mode 100755 egs/iam/v1/local/prepare_dict.sh create mode 100755 egs/iam/v1/local/process_data.py create mode 100755 egs/iam/v1/local/score.sh create mode 100755 egs/iam/v1/local/train_lm.sh create mode 100755 egs/iam/v1/local/wer_output_filter create mode 100755 egs/iam/v1/path.sh create mode 100755 egs/iam/v1/run.sh create mode 120000 egs/iam/v1/steps create mode 120000 egs/iam/v1/utils create mode 100644 egs/uw3/README.txt create mode 100644 egs/uw3/v1/cmd.sh create mode 120000 egs/uw3/v1/image create mode 100755 egs/uw3/v1/local/chain/compare_wer.sh create mode 100755 egs/uw3/v1/local/chain/run_cnn_1a.sh create mode 100755 egs/uw3/v1/local/make_features.py create mode 100755 egs/uw3/v1/local/prepare_data.sh create mode 100755 egs/uw3/v1/local/prepare_dict.sh create mode 100755 egs/uw3/v1/local/process_data.py create mode 100755 egs/uw3/v1/local/score.sh create mode 100755 egs/uw3/v1/local/train_lm.sh create mode 100755 egs/uw3/v1/local/unk_arc_post_to_transcription.py create mode 100755 egs/uw3/v1/path.sh create mode 100755 egs/uw3/v1/run.sh create mode 120000 egs/uw3/v1/steps create mode 120000 egs/uw3/v1/utils diff --git a/egs/iam/README.txt b/egs/iam/README.txt new file mode 
100644 index 00000000000..daeb67af541 --- /dev/null +++ b/egs/iam/README.txt @@ -0,0 +1,4 @@ + +This directory contains example scripts for handwriting recognition on +the IAM dataset: +http://www.fki.inf.unibe.ch/databases/iam-handwriting-database diff --git a/egs/iam/v1/cmd.sh b/egs/iam/v1/cmd.sh new file mode 100644 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/iam/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/iam/v1/image b/egs/iam/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/iam/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/iam/v1/local/chain/compare_wer.sh b/egs/iam/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..4eb665fc702 --- /dev/null +++ b/egs/iam/v1/local/chain/compare_wer.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... 
]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/iam/v1/local/chain/run_cnn_1a.sh b/egs/iam/v1/local/chain/run_cnn_1a.sh new file mode 100755 index 00000000000..3b1571091c1 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_1a.sh @@ -0,0 +1,235 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ +# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098) + +# cat exp/chain/cnn_1a/decode_test/scoring_kaldi/best_* +# %WER 5.94 [ 3913 / 65921, 645 ins, 1466 del, 1802 sub ] exp/chain/cnn_1a/decode_test//cer_11_0.0 +# %WER 9.13 [ 1692 / 
18542, 162 ins, 487 del, 1043 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0 + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${train_data_dir} \ + data/$lang_test $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$frame_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + 
--egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh new file mode 100755 index 00000000000..2c8b6c91e5a --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +# chainali_1a is as 1a except it uses chain alignments (using 1a system) instead of gmm alignments + +# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_1a/ +# System cnn_chainali_1a cnn_1a +# WER 6.69 9.13 +# Final train prob -0.0128 -0.0297 +# Final valid prob -0.0447 -0.0975 +# Final train prob (xent) -0.6448 -0.5915 +# Final valid prob (xent) -0.9924 -1.0022 + +# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1a/ +# exp/chain/cnn_chainali_1a/: 
num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.002->0.000 xent:train/valid[13,20,final]=(-0.929,-0.711,-0.645/-1.16,-1.04,-0.992) logprob:train/valid[13,20,final]=(-0.029,-0.016,-0.013/-0.051,-0.047,-0.045) + +# cat exp/chain/cnn_chainali_1a/decode_test/scoring_kaldi/best_* +# %WER 3.94 [ 2600 / 65921, 549 ins, 837 del, 1214 sub ] exp/chain/cnn_chainali_1a/decode_test/cer_15_0.0 +# %WER 6.69 [ 1241 / 18542, 135 ins, 358 del, 748 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_15_0.5 + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. 
We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false 
dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh new file mode 100755 index 00000000000..ddf596a6126 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# chainali_1b is as chainali_1a except it has 3 more cnn layers and 1 less tdnn layer. 
+# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_chainali_1b/ +# System cnn_chainali_1a cnn_chainali_1b +# WER 6.69 6.25 +# Final train prob -0.0132 -0.0041 +# Final valid prob -0.0509 -0.0337 +# Final train prob (xent) -0.6393 -0.6287 +# Final valid prob (xent) -1.0116 -0.9064 + +# steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/ +# exp/chain/chainali_cnn_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038) + +# cat exp/chain/cnn_chainali_1b/decode_test/scoring_kaldi/best_* +# %WER 3.94 [ 2600 / 65921, 415 ins, 1285 del, 900 sub ] exp/chain/cnn_chainali_1b/decode_test/cer_10_0.0 +# %WER 6.25 [ 1158 / 18542, 103 ins, 469 del, 586 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_12_0.0 + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! 
cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + 
conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/make_features.py b/egs/iam/v1/local/make_features.py new file mode 100755 index 00000000000..b998464953f --- /dev/null +++ b/egs/iam/v1/local/make_features.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). It can optionally pad + the images (on left/right sides) with white pixels. + + eg. 
local/make_features.py data/train --feat-dim 40
+"""
+
+import argparse
+import os
+import sys
+import numpy as np
+from scipy import misc
+
+parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and
+                                                writes them to standard output in text format.""")
+parser.add_argument('dir', type=str,
+                    help='Source data directory (containing images.scp)')
+parser.add_argument('--out-ark', type=str, default='-',
+                    help='Where to write the output feature file')
+parser.add_argument('--feat-dim', type=int, default=40,
+                    help='Size to scale the height of all images')
+parser.add_argument('--padding', type=int, default=5,
+                    help='Number of white pixels to pad on the left '
+                    'and right side of the image.')
+args = parser.parse_args()
+
+
+def write_kaldi_matrix(file_handle, matrix, key):  # writes "key [ rows... ]" as text
+    file_handle.write(key + " [ ")
+    num_rows = len(matrix)
+    if num_rows == 0:
+        raise Exception("Matrix is empty")
+    num_cols = len(matrix[0])
+
+    for row_index in range(len(matrix)):
+        if num_cols != len(matrix[row_index]):
+            raise Exception("All the rows of a matrix are expected to "
+                            "have the same length")
+        file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index])))
+        if row_index != num_rows - 1:
+            file_handle.write("\n")
+    file_handle.write(" ]\n")
+
+def get_scaled_image(im):  # resize to height --feat-dim, then pad left/right with 255
+    scale_size = args.feat_dim
+    sx = im.shape[1]
+    sy = im.shape[0]
+    scale = (1.0 * scale_size) / sy
+    nx = int(scale_size)
+    ny = int(scale * sx)
+    im = misc.imresize(im, (nx, ny))
+    padding_x = args.padding
+    padding_y = im.shape[0]
+    im_pad = np.concatenate((255 * np.ones((padding_y, padding_x),
+                                           dtype=int), im), axis=1)
+    im_pad1 = np.concatenate((im_pad, 255 * np.ones((padding_y, padding_x),
+                                                    dtype=int)), axis=1)
+    return im_pad1
+
+### main ###
+data_list_path = os.path.join(args.dir,'images.scp')
+
+if args.out_ark == '-':
+    out_fh = sys.stdout
+else:
+    out_fh = open(args.out_ark,'w')  # text mode: write_kaldi_matrix() writes str, not bytes
+
+with open(data_list_path) as f:
+    for line in f:
+        line =
line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_scale = get_scaled_image(im) + + data = np.transpose(im_scale, (1, 0)) + data = np.divide(data, 255.0) + write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/iam/v1/local/prepare_data.sh b/egs/iam/v1/local/prepare_data.sh new file mode 100755 index 00000000000..1350c5841df --- /dev/null +++ b/egs/iam/v1/local/prepare_data.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 + +# This script downloads the IAM handwriting database and prepares the training +# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. +# It also downloads the LOB and Brown text corpora. It downloads the database files +# only if they do not already exist in download directory. + +# Eg. local/prepare_data.sh +# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from +# utt2spk file: 000_a01-000u-00 000 +# images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +# spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03 + +stage=0 +download_dir=data/download +username= +password= # username and password for downloading the IAM database + # if you have not already downloaded the database, please + # register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database + # and provide this script with your username and password. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +if [[ ! -f $download_dir/lines.tgz && -z $username ]]; then + echo "$0: Warning: Couldn't find lines.tgz in $download_dir. Unless the extracted dataset files" + echo "exist in your data/local directory this script will fail because the required files" + echo "can't be downloaded automatically (it needs registration)." 
+  echo "Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database"
+  echo "... and then call this script again with --username <username> --password <password>"
+  echo ""
+fi
+
+lines=data/local/lines
+xml=data/local/xml
+ascii=data/local/ascii
+bcorpus=data/local/browncorpus
+lobcorpus=data/local/lobcorpus
+data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask
+lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz
+xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz
+data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndependentTextLineRecognitionTask.zip
+ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz
+brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt
+lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip
+mkdir -p $download_dir data/local
+
+# download and extract images and transcription
+if [ -d $lines ]; then
+  echo "$0: Not downloading lines images as it is already there."
+else
+  if [ ! -f $download_dir/lines.tgz ]; then
+    echo "$0: Trying to download lines images..."
+    wget -P $download_dir --user "$username" --password "$password" $lines_url || exit 1;
+  fi
+  mkdir -p $lines
+  tar -xzf $download_dir/lines.tgz -C $lines || exit 1;
+  echo "$0: Done downloading and extracting lines images"
+fi
+
+if [ -d $xml ]; then
+  echo "$0: Not downloading transcriptions as it is already there."
+else
+  if [ ! -f $download_dir/xml.tgz ]; then
+    echo "$0: Trying to download transcriptions..."
+    wget -P $download_dir --user "$username" --password "$password" $xml_url || exit 1;
+  fi
+  mkdir -p $xml
+  tar -xzf $download_dir/xml.tgz -C $xml || exit 1;
+  echo "$0: Done downloading and extracting transcriptions."
+fi
+
+if [ -d $data_split_info ]; then
+  echo "$0: Not downloading data split information as it is already there."
+else
+  if [ !
-f $download_dir/largeWriterIndependentTextLineRecognitionTask.zip ]; then + echo "$0: Trying to download training and testing data split information..." + wget -P $download_dir --user "$username" --password "$password" $data_split_info_url || exit 1; + fi + mkdir -p $data_split_info + unzip $download_dir/largeWriterIndependentTextLineRecognitionTask.zip -d $data_split_info || exit 1; + echo "$0: Done downloading and extracting training and testing data split information" +fi + +if [ -d $ascii ]; then + echo "$0: Not downloading ascii.tgz as it is already there." +else + if [ ! -f $download_dir/ascii.tgz ]; then + echo "$0: trying to download ascii.tgz..." + wget -P $download_dir --user "$username" --password "$password" $ascii_url || exit 1; + fi + mkdir -p $ascii + tar -xzf $download_dir/ascii.tgz -C $ascii || exit 1; + echo "$0: Done downloading and extracting ascii.tgz" +fi + +if [ -d $lobcorpus ]; then + echo "$0: Not downloading the LOB text corpus as it is already there." +else + if [ ! -f $lobcorpus/0167.zip ]; then + echo "$0: Downloading the LOB text corpus ..." + mkdir -p $lobcorpus + wget -P $lobcorpus/ $lob_corpus_url || exit 1; + fi + unzip $lobcorpus/0167.zip -d $lobcorpus || exit 1; + echo "$0: Done downloading and extracting LOB corpus" +fi + +if [ -d $bcorpus ]; then + echo "$0: Not downloading the Brown corpus as it is already there." +else + if [ ! -f $bcorpus/brown.txt ]; then + mkdir -p $bcorpus + echo "$0: Downloading the Brown text corpus..." 
+ wget -P $bcorpus $brown_corpus_url || exit 1; + fi + echo "$0: Done downloading the Brown text corpus" +fi + +mkdir -p data/{train,test,val} +file_name=largeWriterIndependentTextLineRecognitionTask + +train_old="data/local/$file_name/trainset.txt" +test_old="data/local/$file_name/testset.txt" +val1_old="data/local/$file_name/validationset1.txt" +val2_old="data/local/$file_name/validationset2.txt" + +train_new="data/local/train.uttlist" +test_new="data/local/test.uttlist" +val_new="data/local/validation.uttlist" + +cat $train_old > $train_new +cat $test_old > $test_new +cat $val1_old $val2_old > $val_new + +if [ $stage -le 0 ]; then + local/process_data.py data/local data/train --dataset train || exit 1 + local/process_data.py data/local data/test --dataset test || exit 1 + local/process_data.py data/local data/val --dataset validation || exit 1 + + utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt + utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt +fi diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..77a46df384f --- /dev/null +++ b/egs/iam/v1/local/prepare_dict.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +mkdir -p $dir + +# First get the set of all letters that occur in data/train/text +cat data/train/text | \ + perl -ne '@A = split; shift @A; for(@A) {print join("\n", split(//)), "\n";}' | \ + sort -u > $dir/nonsilence_phones.txt + +# Now list all the unique words (that use only the above letters) +# in data/train/text and LOB+Brown corpora with their comprising +# letters as their transcription. 
(Letter # is replaced with <HASH>)
+
+export letters=$(cat $dir/nonsilence_phones.txt | tr -d "\n")
+
+cut -d' ' -f2- data/train/text | \
+  cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt \
+      data/local/browncorpus/brown.txt - | \
+  perl -e '$letters=$ENV{letters};
+while(<>){ @A = split;
+  foreach(@A) {
+    if(! $seen{$_} && $_ =~ m/^[$letters]+$/){
+      $seen{$_} = 1;
+      $trans = join(" ", split(//));
+      $trans =~ s/#/<HASH>/g;
+      print "$_ $trans\n";
+    }
+  }
+}' | sort > $dir/lexicon.txt
+
+
+sed -i "s/#/<HASH>/" $dir/nonsilence_phones.txt
+
+echo '<sil> SIL' >> $dir/lexicon.txt
+echo '<unk> SIL' >> $dir/lexicon.txt
+
+echo SIL > $dir/silence_phones.txt
+
+echo SIL >$dir/optional_silence.txt
+
+echo -n "" >$dir/extra_questions.txt
diff --git a/egs/iam/v1/local/process_data.py b/egs/iam/v1/local/process_data.py
new file mode 100755
index 00000000000..fa5eb484707
--- /dev/null
+++ b/egs/iam/v1/local/process_data.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+
+# Copyright 2017 Chun Chieh Chang
+#           2017 Ashish Arora
+
+""" This script reads the extracted IAM database files and creates
+    the following files (for the data subset selected via --dataset):
+    text, utt2spk, images.scp.
+
+  Eg. local/process_data.py data/local data/train --dataset train
+  Eg. text file: 000_a01-000u-00 A MOVE to stop Mr.
Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('out_dir', type=str, + help='Where to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.database_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + img_num = line[-3:] + doc = minidom.parse(xml_path) + + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = 
form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder, innerfolder) + image_file_path = lines_path + img_num + '.png' + text = text_dict[line] + utt_id = writer_id + '_' + line + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v1/local/score.sh b/egs/iam/v1/local/score.sh new file mode 100755 index 00000000000..31564d25326 --- /dev/null +++ b/egs/iam/v1/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh new file mode 100755 index 00000000000..aa4303d6a28 --- /dev/null +++ b/egs/iam/v1/local/train_lm.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains an LM on the LOB+Brown text data and IAM training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. 
+ else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Using LOB and brown corpus. + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt > ${dir}/data/text/text.txt + cat data/local/browncorpus/brown.txt >> ${dir}/data/text/text.txt + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/val/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/iam.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. 
+ cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from IAM text + cat ${dir}/data/text/{iam,text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=3 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=2 iam=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + #log-prob: -5.05603614242 [perplexity = 156.967086371] over 19477.0 words +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 1 million n-grams for a big LM for rescoring purposes. 
+ size=1000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_3_prune_big was -5.06654404785 per word [perplexity = 158.625177948] over 19477.0 words + # current results, after adding --limit-unk-history=true: + + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 500,000 n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=500000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_3_prune_small was -5.24719139498 per word [perplexity = 190.031793995] over 19477.0 words + # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst): + + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/iam/v1/local/wer_output_filter b/egs/iam/v1/local/wer_output_filter new file mode 100755 index 00000000000..162482539ed --- /dev/null +++ b/egs/iam/v1/local/wer_output_filter @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuations from words. For e.g. this sentence: + +# "They have come!" he said reverently, gripping his +# hands. "Isn't it a glorious thing! 
Long awaited." + +# is converted to this: + +# " They have come ! " he said reverently , gripping his +# hands . " Isn ' t it a glorious thing ! Long awaited . " + +import sys +import re + +punctuations = "!(),.?;:'-\"" +escaped_punctuations = re.escape(punctuations) + +for line in sys.stdin: + words = line.strip().split() + uttid = words[0] + transcript = ' '.join(words[1:]) + split_transcript = " ".join(re.split("([{}])".format(escaped_punctuations), + transcript)).strip() + print("{} {}".format(uttid, split_transcript)) diff --git a/egs/iam/v1/path.sh b/egs/iam/v1/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/iam/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh new file mode 100755 index 00000000000..c8ebb9ae649 --- /dev/null +++ b/egs/iam/v1/run.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian + +set -e +stage=0 +nj=20 + +# iam_database points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: +iam_database=/export/corpora5/handwriting_ocr/IAM + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + +if [ $stage -le 0 ]; then + echo "$0: Preparing data..." 
+  local/prepare_data.sh --download-dir "$iam_database"
+fi
+mkdir -p data/{train,test}/data
+
+if [ $stage -le 1 ]; then
+  echo "$0: Preparing the test and train feature files..."
+  for dataset in train test; do
+    local/make_features.py data/$dataset --feat-dim 40 | \
+      copy-feats --compress=true --compression-method=7 \
+                 ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp
+    steps/compute_cmvn_stats.sh data/$dataset
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: Preparing dictionary and lang..."
+  local/prepare_dict.sh
+  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \
+                        data/local/dict "<unk>" data/lang/temp data/lang
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: Estimating a language model for decoding..."
+  local/train_lm.sh
+  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \
+                     data/local/dict/lexicon.txt data/lang_test
+fi
+
+if [ $stage -le 4 ]; then
+  steps/train_mono.sh --nj $nj --cmd "$cmd" --totgauss 10000 data/train \
+                      data/lang exp/mono
+fi
+
+if [ $stage -le 5 ]; then
+  utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph
+
+  steps/decode.sh --nj $nj --cmd "$cmd" exp/mono/graph data/test \
+                  exp/mono/decode_test
+fi
+
+if [ $stage -le 6 ]; then
+  steps/align_si.sh --nj $nj --cmd "$cmd" data/train data/lang \
+                    exp/mono exp/mono_ali
+
+  steps/train_deltas.sh --cmd "$cmd" 500 20000 data/train data/lang \
+                        exp/mono_ali exp/tri
+fi
+
+if [ $stage -le 7 ]; then
+  utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph
+
+  steps/decode.sh --nj $nj --cmd "$cmd" exp/tri/graph data/test \
+                  exp/tri/decode_test
+fi
+
+if [ $stage -le 8 ]; then
+  steps/align_si.sh --nj $nj --cmd "$cmd" data/train data/lang \
+                    exp/tri exp/tri_ali
+
+  steps/train_lda_mllt.sh --cmd "$cmd" \
+                          --splice-opts "--left-context=3 --right-context=3" 500 20000 \
+                          data/train data/lang exp/tri_ali exp/tri2
+fi
+
+if [ $stage -le 9 ]; then
+  utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph
+
+  steps/decode.sh --nj
$nj --cmd "$cmd" exp/tri2/graph \
+                  data/test exp/tri2/decode_test
+fi
+
+if [ $stage -le 10 ]; then
+  steps/align_fmllr.sh --nj $nj --cmd "$cmd" --use-graphs true \
+                       data/train data/lang exp/tri2 exp/tri2_ali
+
+  steps/train_sat.sh --cmd "$cmd" 500 20000 \
+                     data/train data/lang exp/tri2_ali exp/tri3
+fi
+
+if [ $stage -le 11 ]; then
+  utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph
+
+  steps/decode_fmllr.sh --nj $nj --cmd "$cmd" exp/tri3/graph \
+                        data/test exp/tri3/decode_test
+fi
+
+if [ $stage -le 12 ]; then
+  steps/align_fmllr.sh --nj $nj --cmd "$cmd" --use-graphs true \
+                       data/train data/lang exp/tri3 exp/tri3_ali
+fi
+
+if [ $stage -le 13 ]; then
+  local/chain/run_cnn_1a.sh
+fi
+
+if [ $stage -le 14 ]; then
+  local/chain/run_cnn_chainali_1b.sh --chain-model-dir exp/chain/cnn_1a --stage 2
+fi
diff --git a/egs/iam/v1/steps b/egs/iam/v1/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/iam/v1/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file
diff --git a/egs/iam/v1/utils b/egs/iam/v1/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/iam/v1/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file
diff --git a/egs/uw3/README.txt b/egs/uw3/README.txt
new file mode 100644
index 00000000000..b02d00ff541
--- /dev/null
+++ b/egs/uw3/README.txt
@@ -0,0 +1,4 @@
+
+This directory contains example scripts for optical character recognition
+(i.e. OCR) on the UW3 dataset (it's a printed English OCR corpus):
+http://isis-data.science.uva.nl/events/dlia//datasets/uwash3.html
diff --git a/egs/uw3/v1/cmd.sh b/egs/uw3/v1/cmd.sh
new file mode 100644
index 00000000000..3c8eb9f93a5
--- /dev/null
+++ b/egs/uw3/v1/cmd.sh
@@ -0,0 +1,13 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export cmd="queue.pl"
diff --git a/egs/uw3/v1/image b/egs/uw3/v1/image
new file mode 120000
index 00000000000..1668ee99922
--- /dev/null
+++ b/egs/uw3/v1/image
@@ -0,0 +1 @@
+../../cifar/v1/image/
\ No newline at end of file
diff --git a/egs/uw3/v1/local/chain/compare_wer.sh b/egs/uw3/v1/local/chain/compare_wer.sh
new file mode 100755
index 00000000000..1a40523355a
--- /dev/null
+++ b/egs/uw3/v1/local/chain/compare_wer.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# this script is used for comparing decoding results between systems.
+# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b}
+
+# ./local/chain/compare_wer.sh exp/chain/cnn1a
+# System                      cnn1a
+# WER                          0.61
+# CER                          0.15
+# Final train prob          -0.0377
+# Final valid prob          -0.0380
+# Final train prob (xent)   -0.0830
+# Final valid prob (xent)   -0.0838
+
+if [ $# -eq 0 ]; then
+  echo "Usage: $0: <dir1> [<dir2> ... 
]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/uw3/v1/local/chain/run_cnn_1a.sh b/egs/uw3/v1/local/chain/run_cnn_1a.sh new file mode 100755 index 00000000000..ad7421e1261 --- /dev/null +++ b/egs/uw3/v1/local/chain/run_cnn_1a.sh @@ -0,0 +1,234 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# steps/info/chain_dir_info.pl exp/chain/cnn1a/ +# exp/chain/cnn1a/: num-iters=153 nj=3..10 num-params=3.6M dim=40->268 combine=-0.034->-0.034 xent:train/valid[101,152,final]=(-0.097,-0.186,-0.092/-0.101,-0.212,-0.098) logprob:train/valid[101,152,final]=(-0.035,-0.067,-0.035/-0.036,-0.082,-0.035) + +# cat 
exp/chain/cnn1a/decode_test/scoring_kaldi/best_* +# %WER 0.19 [ 366 / 188135, 110 ins, 123 del, 133 sub ] exp/chain/cnn1a/decode_test/cer_7_0.5 +# %WER 1.00 [ 357 / 35571, 104 ins, 26 del, 227 sub ] exp/chain/cnn1a/decode_test/wer_5_1.0 + + +set -e -o pipefail + +stage=0 +nj=30 + +# affix for exp dirs, e.g. it was _cleaned in tedlium. +nnet3_affix= + +affix=1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=5 + +# training chunk-options +chunk_width=340,300,200,100 + +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=false + +gmm_dir=exp/tri2 +ali_dir=exp/tri2_ali +lat_dir=exp/chain${nnet3_affix}/tri2_train_lats +dir=exp/chain${nnet3_affix}/cnn${affix} +train_data_dir=data/train +lores_train_data_dir=$train_data_dir # for the start, use the same data for gmm and chain +gmm_lang=data/lang +lang_test=data/lang_unk +tree_dir=exp/chain${nnet3_affix}/tree${affix} + +# the 'lang' directory is created by this script. +# If you create such a directory with a non-standard topology +# you should probably name it differently. +lang=data/lang_chain + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${lores_train_data_dir} \ + $lang_test $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+  if [ -f $tree_dir/final.mdl ]; then
+     echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+     exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor 3 \
+    --context-opts "--context-width=2 --central-position=1" \
+    --cmd "$cmd" 300 ${lores_train_data_dir} \
+    $lang $ali_dir $tree_dir
+fi
+
+
+if [ $stage -le 4 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
+  learning_rate_factor=$(python -c "print(0.5/$xent_regularize)")  # valid in python 2 and 3
+  common1="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=12"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=40 name=input
+
+  conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-2,-1,0,1,2 $common1
+  conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=40 time-offsets=-2,-1,0,1,2 $common1
+  relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=450
+  relu-batchnorm-layer name=tdnn2 input=Append(-5,0,5) dim=450
+  relu-batchnorm-layer name=tdnn3 input=Append(-5,0,5) dim=450
+  relu-batchnorm-layer name=tdnn4 input=Append(-5,0,5) dim=450
+
+  ## adding the layers for chain branch
+  relu-batchnorm-layer name=prefinal-chain dim=450 target-rms=0.5
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+ relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$frame_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # 
The reason we are using data/lang here, instead of $lang, is just to
+  # emphasize that it's not actually important to give mkgraph.sh the
+  # lang directory with the matched topology (since it gets the
+  # topology file from the model). So you could give it a different
+  # lang directory, one that contained a wordlist and LM of your choice,
+  # as long as phones.txt was compatible.
+
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 $lang_test \
+    $dir $dir/graph || exit 1;
+fi
+
+if [ $stage -le 7 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+    --extra-left-context $chunk_left_context \
+    --extra-right-context $chunk_right_context \
+    --extra-left-context-initial 0 \
+    --extra-right-context-final 0 \
+    --frames-per-chunk $frames_per_chunk \
+    --nj $nj --cmd "$cmd" \
+    $dir/graph data/test $dir/decode_test || exit 1;
+fi
diff --git a/egs/uw3/v1/local/make_features.py b/egs/uw3/v1/local/make_features.py
new file mode 100755
index 00000000000..dd0a30a19d7
--- /dev/null
+++ b/egs/uw3/v1/local/make_features.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+
+# Copyright 2017 Chun Chieh Chang
+
+""" This script converts images to Kaldi-format feature matrices. The input to
+    this script is the path to a data directory, e.g. "data/train". This script
+    reads the images listed in images.scp and writes them to standard output
+    (by default) as Kaldi-formatted matrices (in text form). It also scales the
+    images so they have the same height (via --feat-dim). It can optionally pad
+    the images (on left/right sides) with white pixels.
+
+    eg. local/make_features.py data/train --feat-dim 40
+"""
+
+import argparse
+import os
+import sys
+import numpy as np
+from scipy import misc
+from scipy import ndimage
+
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE,SIG_DFL)
+
+
+def str_to_bool(value):
+    """Parses a command-line boolean such as 'true' or 'false'.
+
+    argparse's type=bool calls bool() on the raw string, so every non-empty
+    value (including 'false') would turn the option on.
+    """
+    lowered = value.strip().lower()
+    if lowered in ('true', 't', 'yes', 'y', '1'):
+        return True
+    if lowered in ('false', 'f', 'no', 'n', '0', ''):
+        return False
+    raise argparse.ArgumentTypeError("expected true or false, got '{}'".format(value))
+
+
+parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and
+                                 writes them to standard output in text format.""")
+parser.add_argument('dir', type=str, help='data directory (should contain images.scp)')
+parser.add_argument('--out-ark', type=str, default='-', help='where to write the output feature file.')
+parser.add_argument('--feat-dim', type=int, default=40,
+                    help='size to scale the height of all images (i.e. the dimension of the resulting features)')
+parser.add_argument('--pad', type=str_to_bool, default=False,
+                    help='pad the left and right of the images with 10 white pixels.')
+
+args = parser.parse_args()
+
+
+def write_kaldi_matrix(file_handle, matrix, key):
+    """Writes 'matrix' to 'file_handle' as a Kaldi text-format matrix: the
+    key, then an opening bracket, one space-separated row per line, then a
+    closing bracket. Raises an Exception if the matrix is empty or ragged.
+    """
+    file_handle.write(key + " [ ")
+    num_rows = len(matrix)
+    if num_rows == 0:
+        raise Exception("Matrix is empty")
+    num_cols = len(matrix[0])
+    for row_index in range(len(matrix)):
+        if num_cols != len(matrix[row_index]):
+            raise Exception("All the rows of a matrix are expected to "
+                            "have the same length")
+        file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index])))
+        if row_index != num_rows - 1:
+            file_handle.write("\n")
+    file_handle.write(" ]\n")
+
+def get_scaled_image(im):
+    """Returns 'im' resized to a height of args.feat_dim (the width is scaled
+    by the same factor), minus noise drawn from a normal distribution with
+    mean 2 and std 1. Images taller than wide are first rotated by 90 degrees.
+    """
+    scale_size = args.feat_dim
+    sx = im.shape[1]
+    sy = im.shape[0]
+    # Some Images are rotated
+    if sy > sx:
+        im = np.rot90(im, k = -1)
+        sx = im.shape[1]
+        sy = im.shape[0]
+
+    scale = (1.0 * scale_size) / sy
+    nx = int(scale_size)
+    ny = int(scale * sx)
+    # NOTE: scipy.misc.imresize/imread are deprecated since SciPy 1.0 and are
+    # missing from recent SciPy releases; this script needs an older SciPy.
+    im = misc.imresize(im, (nx, ny))
+
+    noise = np.random.normal(2, 1,(nx, ny))
+    im = im - noise
+
+    return im
+
+### main ###
+data_list_path = os.path.join(args.dir,'images.scp')
+
+if args.out_ark == '-':
+    out_fh = sys.stdout
+else:
+    # text mode: write_kaldi_matrix() writes str objects, not bytes.
+    out_fh = open(args.out_ark,'w')
+
+with open(data_list_path) as f:
+    for line in f:
+        line = line.strip()
+        line_vect = line.split(' ')
+        image_id = line_vect[0]
+        image_path = line_vect[1]
+
+        im = misc.imread(image_path, flatten = True)
+        im_scale = get_scaled_image(im)
+
+        if args.pad:
+            pad = np.ones((args.feat_dim, 10)) * 255
+            im_data = np.hstack((pad, im_scale, pad))
+        else:
+            im_data = im_scale
+
+        data = np.transpose(im_data, (1, 0))
+        data = np.divide(data, 255.0)
+        write_kaldi_matrix(out_fh, data, image_id)
diff --git a/egs/uw3/v1/local/prepare_data.sh b/egs/uw3/v1/local/prepare_data.sh
new file mode 100755
index 00000000000..47f62e4335a
--- /dev/null
+++ b/egs/uw3/v1/local/prepare_data.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Copyright 2017 Chun Chieh Chang
+
+# This script downloads the UW3 dataset (if not already downloaded)
+# and prepares the "train" and "test" data subsets.
+
+set -e
+download_dir=data/download
+
+. ./cmd.sh
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+# Download dir
+download_url=http://www.tmbdev.net/ocrdata/uw3-lines-book.tgz
+data_dir=data/local/extracted_corpus
+
+mkdir -p $download_dir
+mkdir -p $data_dir
+
+if [ -d $data_dir/book ]; then
+  echo "$0: Not downloading dataset as it is already downloaded."
+else
+  if [ ! -f $download_dir/uw3-lines-book.tgz ]; then
+    echo "$0: Downloading dataset..."
+    wget -P $download_dir $download_url || exit 1;
+  fi
+  echo "$0: Extracting..."
+  tar -xzf $download_dir/uw3-lines-book.tgz -C $data_dir/ || exit 1;
+  echo "$0: Done downloading/extracting the datset."
+fi
+
+mkdir -p data/train
+mkdir -p data/test
+echo "$0: Preparing the test and train subsets..."
+local/process_data.py $data_dir/book data || exit 1
+
+utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt
+utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt
diff --git a/egs/uw3/v1/local/prepare_dict.sh b/egs/uw3/v1/local/prepare_dict.sh
new file mode 100755
index 00000000000..72c9b50e5ec
--- /dev/null
+++ b/egs/uw3/v1/local/prepare_dict.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Copyright 2017 Chun Chieh Chang
+#           2017 Hossein Hadian
+
+# This script creates the dictionary directory data/local/dict from the
+# training transcriptions (data/train/text): every distinct token becomes a
+# lexicon entry whose pronunciation is the token's own character sequence.
+
+set -e
+dir=data/local/dict
+
+mkdir -p $dir
+
+# Each lexicon line is "token char1 char2 ...".  The strings are printed
+# through an explicit "%s" format: tokens may contain '%' (punctuation is
+# kept by 'tr'), which awk would otherwise treat as a conversion.
+# NOTE(review): the replacement string for '#' is empty in this copy of the
+# recipe; a placeholder token may have been lost -- confirm against upstream.
+cut -d' ' -f2- data/train/text | tr -cs '[a-z][A-Z][0-9][:punct:]' '\n' | sort -u | \
+  awk '{len=split($0,chars,""); printf("%s ", $0);
+    for (i=1;i<=len;i++) {
+      if(chars[i]=="#") {chars[i]=""}
+      printf("%s ", chars[i])
+    };
+    printf("\n")};' | \
+  sed 's/.$//' > $dir/lexicon.txt;
+
+cut -d' ' -f2- $dir/lexicon.txt | tr ' ' '\n' | sort -u >$dir/nonsilence_phones.txt
+
+# NOTE(review): the word field of the two entries below is empty in this copy
+# (only the SIL pronunciation is left) -- confirm against upstream.
+echo ' SIL' >> $dir/lexicon.txt
+echo ' SIL' >> $dir/lexicon.txt
+
+echo SIL > $dir/silence_phones.txt
+
+echo SIL > $dir/optional_silence.txt
+
+echo -n "" > $dir/extra_questions.txt
diff --git a/egs/uw3/v1/local/process_data.py b/egs/uw3/v1/local/process_data.py
new file mode 100755
index 00000000000..f5b37b04c2f
--- /dev/null
+++ b/egs/uw3/v1/local/process_data.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+
+# Copyright 2017 Chun Chieh Chang
+
+# This script goes through the downloaded UW3 dataset and creates data files "text",
+# "utt2spk", and "images.scp" for the train and test subsets in data/train and data/test.
+
+# text - matches the transcriptions with the image id
+# utt2spk - matches the image id's with the speaker/writer names
+# images.scp - matches the image id's with the actual image file
+
+import argparse
+import os
+import random
+
+parser = argparse.ArgumentParser(description="""Creates data/train and data/test.""")
+parser.add_argument('database_path', type=str, help='path to downloaded (and extracted) UW3 corpus')
+parser.add_argument('out_dir', type=str, default='data',
+                    help='where to create the train and test data directories')
+args = parser.parse_args()
+
+### main ###
+train_text_file = os.path.join(args.out_dir, 'train', 'text')
+train_text_fh = open(train_text_file, 'w+')
+train_utt2spk_file = os.path.join(args.out_dir, 'train', 'utt2spk')
+train_utt2spk_fh = open(train_utt2spk_file, 'w+')
+train_image_file = os.path.join(args.out_dir, 'train', 'images.scp')
+train_image_fh = open(train_image_file, 'w+')
+
+test_text_file = os.path.join(args.out_dir, 'test', 'text')
+test_text_fh = open(test_text_file, 'w+')
+test_utt2spk_file = os.path.join(args.out_dir, 'test', 'utt2spk')
+test_utt2spk_fh = open(test_utt2spk_file, 'w+')
+test_image_file = os.path.join(args.out_dir, 'test', 'images.scp')
+test_image_fh = open(test_image_file, 'w+')
+
+random.seed(0)
+page_count = 0
+for page in sorted(os.listdir(args.database_path)):
+    page_path = os.path.join(args.database_path, page)
+    page_count = page_count + 1
+    for line in sorted(os.listdir(page_path)):
+        if line.endswith('.txt'):
+            text_path = os.path.join(args.database_path, page, line)
+            image_name = line.split('.')[0]
+            image_path = os.path.join(args.database_path, page, image_name + '.png')
+            utt_id = page + '_' + image_name
+            with open(text_path, 'r') as gt_fh:
+                text = gt_fh.readlines()[0].strip()
+
+            # The UW3 dataset doesn't have established training and testing splits.
+            # Each line goes to test with probability 1/21 (about 5%), else to train.
+            coin = random.randint(0, 20)
+            if coin >= 1:
+                train_text_fh.write(utt_id + ' ' + text + '\n')
+                train_utt2spk_fh.write(utt_id + ' ' + str(page_count) + '\n')
+                train_image_fh.write(utt_id + ' ' + image_path + '\n')
+            else:
+                test_text_fh.write(utt_id + ' ' + text + '\n')
+                test_utt2spk_fh.write(utt_id + ' ' + str(page_count) + '\n')
+                test_image_fh.write(utt_id + ' ' + image_path + '\n')
diff --git a/egs/uw3/v1/local/score.sh b/egs/uw3/v1/local/score.sh
new file mode 100755
index 00000000000..9ea4701a833
--- /dev/null
+++ b/egs/uw3/v1/local/score.sh
@@ -0,0 +1,156 @@
+#!/bin/bash
+# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal)
+# Apache 2.0
+
+# This script is like steps/scoring/score_kaldi_wer.sh except it transcribes the
+# unknown-word (unk) tokens using local/unk_arc_post_to_transcription.py and also
+# it calls steps/scoring/score_kaldi_cer.sh at the end.
+
+[ -f ./path.sh ] && . ./path.sh
+
+# begin configuration section.
+cmd=run.pl
+stage=0
+decode_mbr=false
+stats=true
+beam=6
+word_ins_penalty=0.0,0.5,1.0
+min_lmwt=3
+max_lmwt=13
+iter=final
+#end configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
+  echo " Options:"
+  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
+  echo "    --decode_mbr (true/false)       # minimum Bayes risk decoding (confusion network)."
+  echo "    --min_lmwt <int>                # minimum LM-weight for lattice rescoring "
+  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
+  exit 1;
+fi
+
+data=$1
+lang_or_graph=$2
+dir=$3
+model_path=$(dirname "$dir")
+symtab=$lang_or_graph/words.txt
+
+for f in $symtab $dir/lat.1.gz $data/text; do
+  [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
+done
+
+
+ref_filtering_cmd="cat"
+[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter"
+[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter"
+hyp_filtering_cmd="cat"
+[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter"
+[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter"
+
+
+if $decode_mbr ; then
+  echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty"
+else
+  echo "$0: scoring with word insertion penalty=$word_ins_penalty"
+fi
+
+
+mkdir -p $dir/scoring_kaldi
+cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1;
+if [ $stage -le 0 ]; then
+
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    mkdir -p $dir/scoring_kaldi/penalty_$wip/log
+
+    if $decode_mbr ; then
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
+        acwt=\`perl -e \"print 1.0/LMWT\"\`\; \
+        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+        lattice-prune --beam=$beam ark:- ark:- \| \
+        lattice-mbr-decode --word-symbol-table=$symtab \
+        ark:- ark,t:- \| \
+        utils/int2sym.pl -f 2- $symtab \| \
+        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
+
+    else
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
+        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+        lattice-1best ark:- ark:- \| \
+        lattice-align-words $lang_or_graph/phones/word_boundary.int $model_path/final.mdl ark:- ark:- \| \
+        lattice-arc-post $model_path/final.mdl ark:- - \| \
+        local/unk_arc_post_to_transcription.py $lang_or_graph/phones.txt $lang_or_graph/words.txt data/lang_unk/oov.int \| \
+        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
+    fi
+
+    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \
+      cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \
+      tr '[:upper:]' '[:lower:]' \| \
+      compute-wer --text --mode=present \
+      "ark:cat $dir/scoring_kaldi/test_filt.txt| tr '[:upper:]' '[:lower:]' |" ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1;
+
+  done
+fi
+
+
+
+if [ $stage -le 1 ]; then
+
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    for lmwt in $(seq $min_lmwt $max_lmwt); do
+      # adding /dev/null to the command list below forces grep to output the filename
+      grep WER $dir/wer_${lmwt}_${wip} /dev/null
+    done
+  done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1
+
+  best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer)
+  best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}')
+  best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}')
+
+  if [ -z "$best_lmwt" ]; then
+    echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong."
+    exit 1;
+  fi
+
+  if $stats; then
+    mkdir -p $dir/scoring_kaldi/wer_details
+    echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight
+    echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty
+
+    $cmd $dir/scoring_kaldi/log/stats1.log \
+      cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \
+      align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \
+      utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\
+      utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/stats2.log \
+      cat $dir/scoring_kaldi/wer_details/per_utt \| \
+      utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
+      sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/wer_bootci.log \
+      compute-wer-bootci --mode=present \
+      ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \
+      '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1;
+
+  fi
+fi
+
+steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 $data $lang_or_graph $dir || exit 1;
+
+# If we got here, the scoring was successful.
+# As a small aid to prevent confusion, we remove all wer_{?,??} and
+# cer_{?,??} files; these originate from a previous version of the scoring
+# scripts (the current file names also carry the word insertion penalty).
+# The removals are best-effort, so their error output is silenced.
+rm $dir/wer_{?,??} 2>/dev/null
+rm $dir/cer_{?,??} 2>/dev/null
+
+exit 0;
diff --git a/egs/uw3/v1/local/train_lm.sh b/egs/uw3/v1/local/train_lm.sh
new file mode 100755
index 00000000000..39eb051d273
--- /dev/null
+++ b/egs/uw3/v1/local/train_lm.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+# Copyright 2016 Vincent Nguyen
+#           2016 Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+#
+#
+# This script trains an LM on the UW3 training transcriptions.
+# It is based on the example scripts distributed with PocoLM
+
+# It checks that pocolm is installed and, if it is not, asks the user to install it.
+
+set -e
+stage=0
+
+echo "$0 $@"  # Print the command line for logging
+. utils/parse_options.sh || exit 1;
+
+dir=data/local/local_lm
+lm_dir=${dir}/data
+
+. ./path.sh
+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+ cd $KALDI_ROOT/tools || exit 1;
+ if [ -d pocolm ]; then
+   echo Not installing the pocolm toolkit since it is already there.
+ else
+   echo "$0: Please install the PocoLM toolkit with: "
+   echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+   exit 1;
+ fi
+) || exit 1;
+
+num_dev_sentences=4500
+bypass_metaparam_optim_opt=
+# If you want to bypass the metaparameter optimization steps with specific metaparameters
+# un-comment the following line, and change the numbers to some appropriate values.
+# You can find the values from output log of train_lm.py.
+# These example numbers of metaparameters is for 4-gram model (with min-counts)
+# running with train_lm.py.
+# The dev perplexity should be close to the non-bypassed model.
+#bypass_metaparam_optim_opt=
+# Note: to use these example parameters, you may need to remove the .done files
+# to make sure the make_lm_dir.py be called and train only 3-gram model
+#for order in 3; do
+#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done
+
+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text
+
+  echo "$0: Getting the Data sources"
+
+  rm ${dir}/data/text/* 2>/dev/null || true
+
+  head -n $num_dev_sentences < data/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt
+  tail -n +$((num_dev_sentences+1)) < data/train/text | cut -d " " -f 2- > ${dir}/data/text/uw3.txt
+
+  # for reporting perplexities, we'll use the "real" dev set.
+  # (a subset of the training data is used as ${dir}/data/text/dev.txt to work
+  # out interpolation weights.)
+  # note, we can't put it in ${dir}/data/text/, because then pocolm would use
+  # it as one of the data sources.
+  cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt
+
+  # get wordlist
+  cat ${dir}/data/text/uw3.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
+  cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
+fi
+
+order=3
+
+if [ $stage -le 1 ]; then
+  # decide on the vocabulary.
+  # Note: you'd use --wordlist if you had a previously determined word-list
+  # that you wanted to use.
+  # Note: if you have more than one order, use a certain amount of words as the
+  # vocab and want to restrict max memory for 'sort',
+  echo "$0: training the unpruned LM"
+  min_counts='train=2 uw3=1'
+  wordlist=${dir}/data/wordlist
+
+  lm_name="$(basename ${wordlist})_${order}"
+  if [ -n "${min_counts}" ]; then
+    lm_name+="_$(echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-")"
+  fi
+  unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
+  train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \
+              --limit-unk-history=true \
+              --fold-dev-into=uw3 ${bypass_metaparam_optim_opt} \
+              ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'
+
+  # No need for pruning as the training data is quite small (total # of
+  # n-grams is 685k). Write the arpa:
+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz
+fi
diff --git a/egs/uw3/v1/local/unk_arc_post_to_transcription.py b/egs/uw3/v1/local/unk_arc_post_to_transcription.py
new file mode 100755
index 00000000000..c86d35e4b8a
--- /dev/null
+++ b/egs/uw3/v1/local/unk_arc_post_to_transcription.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+
+# Copyright 2017 Ashish Arora
+
+# This script reads tab-separated lines (utterance id in the first field, word
+# id in the fifth, phone ids in the sixth; local/score.sh pipes the output of
+# lattice-arc-post into it) and writes one transcription line per utterance.
+# A word whose id equals the one stored in the 'unk' file is spelled out from
+# its phone sequence instead of being looked up in the word table.
+
+import argparse
+import sys
+
+parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""")
+parser.add_argument('phones', type=str, help='phones and phonesID')
+parser.add_argument('words', type=str, help='word and wordID')
+parser.add_argument('unk', type=str, default='-', help='location of unk file')
+parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data')
+parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data')
+args = parser.parse_args()
+### main ###
+phone_fh = open(args.phones, 'r')
+word_fh = open(args.words, 'r')
+unk_fh = open(args.unk,'r')
+if args.input_ark == '-':
+    input_fh = sys.stdin
+else:
+    input_fh = open(args.input_ark,'r')
+if args.out_ark == '-':
+    out_fh = sys.stdout
+else:
+    out_fh = open(args.out_ark,'w')  # text mode: only str objects are written
+
+phone_dict = dict()# stores phoneID and phone mapping
+phone_data_vect = phone_fh.read().strip().split("\n")
+for key_val in phone_data_vect:
+    key_val = key_val.split(" ")
+    phone_dict[key_val[1]] = key_val[0]
+word_dict = dict()# stores wordID and word mapping
+word_data_vect = word_fh.read().strip().split("\n")
+for key_val in word_data_vect:
+    key_val = key_val.split(" ")
+    word_dict[key_val[1]] = key_val[0]
+unk_val = unk_fh.read().strip().split(" ")[0]
+
+utt_word_dict = dict()# stores utteranceID and the words of the utterance by position
+utt_phone_dict = dict()# stores utteranceID and phoneID
+unk_word_dict = dict()
+count=0
+for line in input_fh:
+    line_vect = line.strip().split("\t")
+    if len(line_vect) < 6:
+        sys.stderr.write("IndexError\n")  # stdout may carry the transcriptions
+        sys.stderr.write(str(line_vect) + "\n")
+        continue
+    uttID = line_vect[0]
+    word = line_vect[4]
+    phones = line_vect[5]
+    if uttID in utt_word_dict:
+        utt_word_dict[uttID][count] = word
+        utt_phone_dict[uttID][count] = phones
+    else:
+        count = 0
+        utt_word_dict[uttID] = dict()
+        utt_phone_dict[uttID] = dict()
+        utt_word_dict[uttID][count] = word
+        utt_phone_dict[uttID][count] = phones
+    if word == unk_val: # get character sequence for unk
+        phone_key_vect = phones.split(" ")
+        phone_val_vect = list()
+        for pkey in phone_key_vect:
+            phone_val_vect.append(phone_dict[pkey])
+        phone_2_word = list()
+        for phone_val in phone_val_vect:
+            phone_2_word.append(phone_val.split('_')[0])
+        phone_2_word = ''.join(phone_2_word)
+        utt_word_dict[uttID][count] = phone_2_word
+    else:
+        if word == '0':
+            word_val = ' '
+        else:
+            word_val = word_dict[word]
+        utt_word_dict[uttID][count] = word_val
+    count += 1
+
+transcription = ""
+for key in sorted(utt_word_dict):
+    transcription = key
+    for index in sorted(utt_word_dict[key]):
+        value = utt_word_dict[key][index]
+        transcription = transcription + " " + value
+    out_fh.write(transcription + '\n')
diff --git a/egs/uw3/v1/path.sh b/egs/uw3/v1/path.sh
new file mode 100755
index 00000000000..2d17b17a84a
--- /dev/null
+++ b/egs/uw3/v1/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
diff --git a/egs/uw3/v1/run.sh b/egs/uw3/v1/run.sh
new file mode 100755
index 00000000000..68c51fa4690
--- /dev/null
+++ b/egs/uw3/v1/run.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+
+# Copyright 2017 Chun Chieh Chang
+#           2017 Ashish Arora
+#           2017 Hossein Hadian
+
+
+set -e -o pipefail
+stage=0
+nj=30
+
+# This is the database path on the JHU grid. You may set this
+# to data/download, in which case the script will automatically download
+# the database:
+uw3_database=/export/a10/corpora5/handwriting_ocr/UW3/
+
+. ./path.sh
+. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
+           ## This relates to the queue.
+. utils/parse_options.sh  # e.g. this parses the --stage option if supplied.
+
+
+if [ $stage -le 0 ]; then
+  # Data preparation
+  local/prepare_data.sh --download-dir "$uw3_database"
+fi
+
+mkdir -p data/{train,test}/data
+if [ $stage -le 1 ]; then
+  echo "$0: Preparing feature files for the test and training data..."
+  for f in train test; do
+    local/make_features.py --feat-dim 40 --pad true data/$f | \
+      copy-feats --compress=true --compression-method=7 \
+                 ark:- ark,scp:data/$f/data/images.ark,data/$f/feats.scp || exit 1
+
+    steps/compute_cmvn_stats.sh data/$f || exit 1;
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: Preparing dictionary and lang..."
+  local/prepare_dict.sh
+  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \
+                        data/local/dict "" data/lang/temp data/lang
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: Estimating a language model for decoding..."
+  local/train_lm.sh
+  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \
+                     data/local/dict/lexicon.txt data/lang_test
+
+  echo "$0: Preparing the unk model for open-vocab decoding..."
+  utils/lang/make_unk_lm.sh --ngram-order 4 --num-extra-ngrams 7500 \
+                            data/local/dict exp/unk_lang_model
+  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \
+                        --unk-fst exp/unk_lang_model/unk_fst.txt \
+                        data/local/dict "" data/lang_unk/temp data/lang_unk
+  cp data/lang_test/G.fst data/lang_unk/G.fst
+fi
+
+if [ $stage -le 4 ]; then
+  steps/train_mono.sh --nj $nj --cmd "$cmd" \
+    data/train data/lang exp/mono
+fi
+
+if [ $stage -le 5 ]; then
+  steps/align_si.sh --nj $nj --cmd "$cmd" \
+    data/train data/lang exp/mono exp/mono_ali
+  steps/train_deltas.sh --cmd "$cmd" 500 20000 \
+    data/train data/lang exp/mono_ali exp/tri
+fi
+
+if [ $stage -le 6 ]; then
+  steps/align_si.sh --nj $nj --cmd "$cmd" \
+    data/train data/lang exp/tri exp/tri_ali
+  steps/train_lda_mllt.sh --cmd "$cmd" --splice-opts "--left-context=3 --right-context=3" 500 20000 \
+    data/train data/lang exp/tri_ali exp/tri2
+fi
+
+if [ $stage -le 7 ]; then
+  utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph
+  steps/decode.sh --nj $nj --cmd "$cmd" \
+    exp/mono/graph data/test exp/mono/decode_test
+fi
+
+if [ $stage -le 8 ]; then
+  utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph
+  steps/decode.sh --nj $nj --cmd "$cmd" \
+    exp/tri/graph data/test exp/tri/decode_test
+fi
+
+if [ $stage -le 9 ]; then
+  utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph
+  steps/decode.sh --nj $nj --cmd "$cmd" \
+    exp/tri2/graph data/test exp/tri2/decode_test
+fi
+
+if [ $stage -le 10 ]; then
+  steps/align_si.sh --nj $nj --cmd "$cmd" --use-graphs true \
+    data/train data/lang exp/tri2 exp/tri2_ali
+fi
+
+if [ $stage -le 11 ]; then
+  local/chain/run_cnn_1a.sh
+fi
diff --git a/egs/uw3/v1/steps b/egs/uw3/v1/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/uw3/v1/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file
diff --git a/egs/uw3/v1/utils b/egs/uw3/v1/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/uw3/v1/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file