From 53baf84ba1c3ef0a2065d203d09a800cb2ab9005 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 29 Jul 2015 19:44:38 -0400
Subject: [PATCH] Some new functionality in nnet3; including drafts of example
 scripts

---
 .gitignore                                    |   2 +
 egs/wsj/s5/local/nnet3/run_ivector_common.sh  |  84 ++++++++++++
 egs/wsj/s5/local/nnet3/run_tdnn.sh            |  68 ++++++++++
 egs/wsj/s5/run.sh                             |   1 -
 egs/wsj/s5/steps/nnet3/train_tdnn.sh          |  43 +++---
 src/nnet3bin/Makefile                         |   3 +-
 ...et-lda-stats.cc => nnet3-acc-lda-stats.cc} |   8 +-
 src/nnet3bin/nnet3-compute-from-egs.cc        | 125 +++++++++++++++++
 src/nnet3bin/nnet3-copy-egs.cc                |   4 +-
 src/nnet3bin/nnet3-merge-egs.cc               | 127 ++++++++++++++++++
 10 files changed, 435 insertions(+), 30 deletions(-)
 create mode 100755 egs/wsj/s5/local/nnet3/run_ivector_common.sh
 create mode 100755 egs/wsj/s5/local/nnet3/run_tdnn.sh
 rename src/nnet3bin/{nnet3-get-lda-stats.cc => nnet3-acc-lda-stats.cc} (97%)
 create mode 100644 src/nnet3bin/nnet3-compute-from-egs.cc
 create mode 100644 src/nnet3bin/nnet3-merge-egs.cc

diff --git a/.gitignore b/.gitignore
index e1becd8ae6f..0fe7c4ff1a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@
 # emacs saves
 [#]*[#]
+.[#]*
 *~
 # .depend.mk files
 .depend.mk
@@ -59,6 +60,7 @@
 /tools/CLAPACK_include
 /tools/kaldi_lm
 /tools/env.sh
+/tools/rnnlm-hs-0.1b/rnnlm
 # /src/
 /src/kaldi.mk
diff --git a/egs/wsj/s5/local/nnet3/run_ivector_common.sh b/egs/wsj/s5/local/nnet3/run_ivector_common.sh
new file mode 100755
index 00000000000..e98213baa2e
--- /dev/null
+++ b/egs/wsj/s5/local/nnet3/run_ivector_common.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+
+# this script is called from scripts like run_tdnn.sh; it does the common
+# stages of the build, such as feature extraction.
+# This is actually the same as local/online/run_nnet2_common.sh, except
+# for the directory names.
+
+. ./cmd.sh
+mfccdir=mfcc
+
+stage=1
+
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if [ $stage -le 1 ]; then
+  for datadir in train_si284 test_eval93 test_dev93 test_eval92; do
+    utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
+    steps/make_mfcc.sh --nj 40 --mfcc-config conf/mfcc_hires.conf \
+      --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
+    steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
+  done
+  utils/subset_data_dir.sh --first data/train_si284_hires 7138 data/train_si84_hires || exit 1
+fi
+
+if [ $stage -le 2 ]; then
+  # We need to build a small system just because we need the LDA+MLLT transform
+  # to train the diag-UBM on top of.  We align the si84 data for this purpose.
+
+  steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
+    data/train_si84 data/lang exp/tri4b exp/nnet3/tri4b_ali_si84
+fi
+
+if [ $stage -le 3 ]; then
+  # Train a small system just for its LDA+MLLT transform.  We use --num-iters 13
+  # because after we get the transform (12th iter is the last), any further
+  # training is pointless.
+  steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
+    --realign-iters "" \
+    --splice-opts "--left-context=3 --right-context=3" \
+    5000 10000 data/train_si84_hires data/lang \
+    exp/nnet3/tri4b_ali_si84 exp/nnet3/tri5b
+fi
+
+if [ $stage -le 4 ]; then
+  mkdir -p exp/nnet3
+
+  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
+    --num-frames 400000 data/train_si84_hires 256 exp/nnet3/tri5b exp/nnet3/diag_ubm
+fi
+
+if [ $stage -le 5 ]; then
+  # even though $nj is just 10, each job uses multiple processes and threads.
+  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
+    data/train_si284_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1;
+fi
+
+if [ $stage -le 6 ]; then
+  # We extract iVectors on all the train_si284 data, which will be what we
+  # train the system on.
+
+  # having a larger number of speakers is helpful for generalization, and to
+  # handle per-utterance decoding well (the online iVector estimate starts at
+  # zero for each new speaker).
+  steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_si284_hires \
+    data/train_si284_hires_max2
+
+  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
+    data/train_si284_hires_max2 exp/nnet3/extractor exp/nnet3/ivectors_train_si284 || exit 1;
+fi
+
+if [ $stage -le 7 ]; then
+  rm exp/nnet3/.error 2>/dev/null
+  for data in test_eval92 test_dev93 test_eval93; do
+    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 8 \
+      data/${data}_hires exp/nnet3/extractor exp/nnet3/ivectors_${data} || touch exp/nnet3/.error &
+  done
+  wait
+  [ -f exp/nnet3/.error ] && echo "$0: error extracting iVectors." && exit 1;
+fi
+
+exit 0;
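Once stage 6 has run, the extracted iVectors can be sanity-checked with feat-to-dim; a minimal sketch, assuming the standard ivector_online.scp layout written by extract_ivectors_online.sh (the expected dimension, typically 100, depends on the extractor configuration):

    feat-to-dim scp:exp/nnet3/ivectors_train_si284/ivector_online.scp -
    # prints the online-iVector dimension, e.g. 100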
diff --git a/egs/wsj/s5/local/nnet3/run_tdnn.sh b/egs/wsj/s5/local/nnet3/run_tdnn.sh
new file mode 100755
index 00000000000..95bc524a497
--- /dev/null
+++ b/egs/wsj/s5/local/nnet3/run_tdnn.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# this is the standard "tdnn" system, built in nnet3; it's what we used to
+# call the multi-splice system.
+
+# At this script level we don't support running without a GPU, as it would be
+# painfully slow.  If you want to run without a GPU you'd have to call
+# train_tdnn.sh with --gpu false, --num-threads 16 and --minibatch-size 128.
+
+stage=0
+train_stage=-10
+dir=exp/nnet3/nnet_ms_a
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
...
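As noted at the top of run_tdnn.sh, a CPU-only run is possible one level down. A minimal sketch of such a call, using only the three options named in that comment (the remaining arguments are elided, since the full usage of train_tdnn.sh is not shown in this patch):

    steps/nnet3/train_tdnn.sh --gpu false --num-threads 16 \
      --minibatch-size 128 ...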
diff --git a/egs/wsj/s5/steps/nnet3/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/train_tdnn.sh
...
 # Format : layer<hidden_layer>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
 # note: hidden layers which are composed of one or more components,
 # so hidden layer indexing is different from component count
@@ -62,7 +62,7 @@ randprune=4.0 # speeds up LDA.
 affine_opts=
 
 gpu=true    # if true, we run on GPU.
-cpu_num_threads=16  # if using CPU, the number of threads we use.
+num_threads=16  # if using CPU, the number of threads we use.
 combine_num_threads=8  # number of threads for the "combine" operation
 cleanup=true
 egs_dir=
@@ -257,7 +257,7 @@ if [ $stage -le -3 ]; then
   # Write stats with the same format as stats for LDA.
   $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \
-    nnet3-get-lda-stats --rand-prune=$rand_prune \
+    nnet3-acc-lda-stats --rand-prune=$rand_prune \
     $dir/init.raw $egs_dir/egs.JOB.ark $dir/JOB.lda_stats || exit 1;
 
   all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done)
@@ -327,25 +327,25 @@ finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
 
 echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
 
-if [ $num_threads -eq 1 ]; then
+if $gpu; then
   parallel_suffix="-simple" # this enables us to use GPU code if
                             # we have just one thread.
+  train_queue_opt="--gpu 1"
   parallel_train_opts=
-  if !$gpu; then
-    train_gpu_opt="--gpu 1"
-    if ! cuda-compiled; then
-      echo "$0: WARNING: you are running with one thread but you have not compiled"
-      echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
-      echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
-      exit 1
-    fi
-  else
-    echo "$0: WARNING: running with 1 thread and no GPU: this will be slow."
+  if ! cuda-compiled; then
+    echo "$0: WARNING: you are running with one thread but you have not compiled"
+    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
+    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
+    exit 1
   fi
 else
-  $gpu && echo "$0: you must use --gpu false if you supply num-threads > 1" && exit 1;
-  parallel_suffix="-parallel"
-  parallel_train_opts="--num-threads=$num_threads"
+  if [ $num_threads -gt 1 ]; then
+    parallel_suffix="-parallel"
+    parallel_train_opts="--num-threads=$num_threads"
+    train_queue_opt="--num-threads $num_threads"
+  else
+    parallel_suffix="-simple"
+  fi
 fi
 
@@ -367,7 +367,6 @@ first_model_combine=$[$num_iters-$num_iters_combine+1]
 
 x=0
 
-
 for realign_time in $realign_times; do
   # Work out the iterations on which we will re-align, if the --realign-times
   # option was used.  This is slightly approximate.
@@ -407,7 +406,8 @@ while [ $x -lt $num_iters ]; do
       $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
         nnet3-copy-egs --srand=JOB --frame=random $context_opts ark:$prev_egs_dir/egs.1.ark ark:- \| \
         nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
-        nnet3-compute-from-egs "nnet3-to-raw $dir/$x.mdl -|" ark:- ark:- \| \
+        nnet3-merge-egs ark:- ark:- \| \
+        nnet3-compute-from-egs --apply-exp "nnet3-to-raw $dir/$x.mdl -|" ark:- ark:- \| \
         matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;
 
       sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.
@@ -499,7 +499,7 @@ while [ $x -lt $num_iters ]; do
       # same archive with different frame indexes will give similar gradients,
       # so we want to separate them in time.
 
-      $cmd $train_gpu_opt $dir/log/train.$x.$n.log \
+      $cmd $train_queue_opt $dir/log/train.$x.$n.log \
        nnet3-train$parallel_suffix $parallel_train_opts --minibatch-size=$this_minibatch_size --srand=$x "$raw" \
        "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size ark:- ark:- |" \
        $dir/$[$x+1].$n.raw || touch $dir/.error &
@@ -600,7 +600,8 @@ if [ $stage -le $[$num_iters+1] ]; then
     $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
       nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \
       nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
-      nnet3-compute-from-egs "nnet3-am-copy --raw=true $dir/final.mdl -|" ark:- ark:- \| \
+      nnet3-merge-egs ark:- ark:- \| \
+      nnet3-compute-from-egs --apply-exp=true "nnet3-am-copy --raw=true $dir/final.mdl -|" ark:- ark:- \| \
      matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;
 
     sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.
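The get_post pipeline added above can also be run by hand; a minimal sketch with illustrative paths (the \| escapes in the script are only needed because the pipeline is passed to $cmd; run interactively, plain pipes are used):

    nnet3-copy-egs --frame=random ark:exp/nnet3/tdnn/egs/egs.1.ark ark:- | \
      nnet3-subset-egs --n=1000 ark:- ark:- | \
      nnet3-merge-egs ark:- ark:- | \
      nnet3-compute-from-egs --apply-exp=true \
        "nnet3-am-copy --raw=true exp/nnet3/tdnn/final.mdl -|" ark:- ark:- | \
      matrix-sum-rows ark:- ark:- | \
      vector-sum ark:- exp/nnet3/tdnn/post.vec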
diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile
index 1234306b753..8a3ba2d67af 100644
--- a/src/nnet3bin/Makefile
+++ b/src/nnet3bin/Makefile
@@ -7,7 +7,8 @@ LDFLAGS += $(CUDA_LDFLAGS)
 LDLIBS += $(CUDA_LDLIBS)
 
 BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \
-           nnet3-shuffle-egs nnet3-get-lda-stats
+           nnet3-shuffle-egs nnet3-acc-lda-stats nnet3-merge-egs \
+           nnet3-compute-from-egs
 
 OBJFILES =
diff --git a/src/nnet3bin/nnet3-get-lda-stats.cc b/src/nnet3bin/nnet3-acc-lda-stats.cc
similarity index 97%
rename from src/nnet3bin/nnet3-get-lda-stats.cc
rename to src/nnet3bin/nnet3-acc-lda-stats.cc
index 409a0e8c926..b9b222fdec5 100644
--- a/src/nnet3bin/nnet3-get-lda-stats.cc
+++ b/src/nnet3bin/nnet3-acc-lda-stats.cc
@@ -1,4 +1,4 @@
-// nnet3bin/nnet3-get-lda-stats.cc
+// nnet3bin/nnet3-acc-lda-stats.cc
 
 // Copyright 2015  Johns Hopkins University (author: Daniel Povey)
 
@@ -125,16 +125,14 @@ int main(int argc, char *argv[]) {
         "training examples is used for the class labels.  Used in obtaining\n"
         "feature transforms that help nnet training work better.\n"
         "\n"
-        "Usage: nnet3-get-lda-stats [options] <raw-nnet-in> <training-examples-in> <lda-stats-out>\n"
+        "Usage: nnet3-acc-lda-stats [options] <raw-nnet-in> <training-examples-in> <lda-stats-out>\n"
         "e.g.:\n"
-        "nnet3-get-lda-stats 0.raw ark:1.egs 1.acc\n"
+        "nnet3-acc-lda-stats 0.raw ark:1.egs 1.acc\n"
        "See also: nnet-get-feature-transform\n";
 
     bool binary_write = true;
     BaseFloat rand_prune = 0.0;
 
-    LdaEstimate lda;
-
     ParseOptions po(usage);
     po.Register("binary", &binary_write, "Write output in binary mode");
     po.Register("rand-prune", &rand_prune,
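Run standalone for a single job, the renamed binary looks like this (paths are illustrative; this mirrors the get_lda_stats command in the train_tdnn.sh hunk above):

    nnet3-acc-lda-stats --rand-prune=4.0 exp/nnet3/tdnn/init.raw \
      ark:exp/nnet3/tdnn/egs/egs.1.ark exp/nnet3/tdnn/1.lda_stats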
diff --git a/src/nnet3bin/nnet3-compute-from-egs.cc b/src/nnet3bin/nnet3-compute-from-egs.cc
new file mode 100644
index 00000000000..35ecb6b0b8a
--- /dev/null
+++ b/src/nnet3bin/nnet3-compute-from-egs.cc
@@ -0,0 +1,125 @@
+// nnet3bin/nnet3-compute-from-egs.cc
+
+// Copyright 2015  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-nnet.h"
+#include "nnet3/nnet-example-utils.h"
+#include "nnet3/nnet-optimize.h"
+#include "transform/lda-estimate.h"
+
+
+namespace kaldi {
+namespace nnet3 {
+
+class NnetComputerFromEg {
+ public:
+  NnetComputerFromEg(const Nnet &nnet):
+      nnet_(nnet), compiler_(nnet) { }
+
+  // Compute the output (which will have the same number of rows as the number
+  // of Indexes in the output of the eg), and put it in "output".
+  void Compute(const NnetExample &eg, Matrix<BaseFloat> *output) {
+    ComputationRequest request;
+    bool need_backprop = false, store_stats = false;
+    GetComputationRequest(nnet_, eg, need_backprop, store_stats, &request);
+    const NnetComputation &computation = *(compiler_.Compile(request));
+    NnetComputeOptions options;
+    if (GetVerboseLevel() >= 3)
+      options.debug = true;
+    NnetComputer computer(options, computation, nnet_, NULL);
+    computer.AcceptInputs(nnet_, eg);
+    computer.Forward();
+    const CuMatrixBase<BaseFloat> &nnet_output = computer.GetOutput("output");
+    output->Resize(nnet_output.NumRows(), nnet_output.NumCols());
+    nnet_output.CopyToMat(output);
+  }
+ private:
+  const Nnet &nnet_;
+  CachingOptimizingCompiler compiler_;
+
+};
+
+}
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Read input nnet training examples, and compute the output for each one.\n"
+        "If --apply-exp=true, apply the Exp() function to the output before writing\n"
+        "it out.\n"
+        "\n"
+        "Usage:  nnet3-compute-from-egs [options] <raw-nnet-in> <training-examples-in> <matrices-out>\n"
+        "e.g.:\n"
+        "nnet3-compute-from-egs --apply-exp=true 0.raw ark:1.egs ark:- | matrix-sum-rows ark:- ... \n"
+        "See also: nnet3-compute\n";
+
+    bool binary_write = true,
+        apply_exp = false;
+
+    ParseOptions po(usage);
+    po.Register("binary", &binary_write, "Write output in binary mode");
+    po.Register("apply-exp", &apply_exp, "If true, apply exp function to "
+                "output");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string nnet_rxfilename = po.GetArg(1),
+        examples_rspecifier = po.GetArg(2),
+        matrix_wspecifier = po.GetArg(3);
+
+    Nnet nnet;
+    ReadKaldiObject(nnet_rxfilename, &nnet);
+
+    NnetComputerFromEg computer(nnet);
+
+    int64 num_egs = 0;
+
+    SequentialNnetExampleReader example_reader(examples_rspecifier);
+    BaseFloatMatrixWriter matrix_writer(matrix_wspecifier);
+
+    for (; !example_reader.Done(); example_reader.Next(), num_egs++) {
+      Matrix<BaseFloat> output;
+      computer.Compute(example_reader.Value(), &output);
+      KALDI_ASSERT(output.NumRows() != 0);
+      if (apply_exp)
+        output.ApplyExp();
+      matrix_writer.Write(example_reader.Key(), output);
+    }
+    KALDI_LOG << "Processed " << num_egs << " examples.";
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
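Completing the example from the usage string into the prior-estimation form used by train_tdnn.sh (output names are illustrative):

    nnet3-compute-from-egs --apply-exp=true 0.raw ark:1.egs ark:- | \
      matrix-sum-rows ark:- ark:- | \
      vector-sum ark:- post.vec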
diff --git a/src/nnet3bin/nnet3-copy-egs.cc b/src/nnet3bin/nnet3-copy-egs.cc
index daa83789868..e521e3a5498 100644
--- a/src/nnet3bin/nnet3-copy-egs.cc
+++ b/src/nnet3bin/nnet3-copy-egs.cc
@@ -253,9 +253,9 @@ int main(int argc, char *argv[]) {
         "Usage:  nnet3-copy-egs [options] <egs-rspecifier> <egs-wspecifier1> [<egs-wspecifier2> ...]\n"
         "\n"
         "e.g.\n"
-        "nnet-copy-egs ark:train.egs ark,t:text.egs\n"
+        "nnet3-copy-egs ark:train.egs ark,t:text.egs\n"
         "or:\n"
-        "nnet-copy-egs ark:train.egs ark:1.egs ark:2.egs\n";
+        "nnet3-copy-egs ark:train.egs ark:1.egs ark:2.egs\n";
 
     bool random = false;
     int32 srand_seed = 0;
diff --git a/src/nnet3bin/nnet3-merge-egs.cc b/src/nnet3bin/nnet3-merge-egs.cc
new file mode 100644
index 00000000000..352f67a7b70
--- /dev/null
+++ b/src/nnet3bin/nnet3-merge-egs.cc
@@ -0,0 +1,127 @@
+// nnet3bin/nnet3-merge-egs.cc
+
+// Copyright 2012-2015  Johns Hopkins University (author: Daniel Povey)
+//                2014  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-example.h"
+#include "nnet3/nnet-example-utils.h"
+
+namespace kaldi {
+namespace nnet3 {
+// returns the number of indexes/frames in the NnetIo named "output" in the eg,
+// or crashes if it is not there.
+int32 NumOutputIndexes(const NnetExample &eg) {
+  for (size_t i = 0; i < eg.io.size(); i++)
+    if (eg.io[i].name == "output")
+      return eg.io[i].indexes.size();
+  KALDI_ERR << "No output named 'output' in the eg.";
+  return 0;  // Suppress compiler warning.
+}
+
+}
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "This copies nnet training examples from input to output, but while doing so it\n"
+        "merges many NnetExample objects into one, forming a minibatch consisting of a\n"
+        "single NnetExample.  Note: if --measure-output-frames=true, which it is by default,\n"
+        "the --minibatch-size option will be interpreted as a target number of output frames;\n"
+        "otherwise as a number of input examples to combine.  This makes a difference\n"
+        "if the input examples have multiple supervised frames in them.\n"
+        "\n"
+        "Usage:  nnet3-merge-egs [options] <egs-rspecifier> <egs-wspecifier>\n"
+        "e.g.\n"
+        "nnet3-merge-egs --minibatch-size=512 ark:1.egs ark:- | nnet3-train-simple ... \n"
+        "See also nnet3-copy-egs\n";
+
+    bool compress = false;
+    int32 minibatch_size = 512;
+    bool measure_output_frames = true;
+
+    ParseOptions po(usage);
+    po.Register("minibatch-size", &minibatch_size, "Target size of minibatches "
+                "when merging (see also --measure-output-frames)");
+    po.Register("measure-output-frames", &measure_output_frames, "If true, "
+                "--minibatch-size is a target number of total output frames; if "
+                "false, --minibatch-size is the number of input examples to "
+                "merge.");
+    po.Register("compress", &compress, "If true, compress the output examples "
+                "(not recommended unless you are writing to disk)");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string examples_rspecifier = po.GetArg(1),
+        examples_wspecifier = po.GetArg(2);
+
+    SequentialNnetExampleReader example_reader(examples_rspecifier);
+    NnetExampleWriter example_writer(examples_wspecifier);
+
+    std::vector<NnetExample> examples;
+    examples.reserve(minibatch_size);
+
+    int32 cur_num_output_frames = 0;
+
+    int64 num_read = 0, num_written = 0;
+    while (!example_reader.Done()) {
+      const NnetExample &cur_eg = example_reader.Value();
+      examples.resize(examples.size() + 1);
+      examples.back() = cur_eg;
+      cur_num_output_frames += NumOutputIndexes(cur_eg);
+      bool minibatch_ready =
+          (measure_output_frames ?
+           cur_num_output_frames >= minibatch_size :
+           static_cast<int32>(examples.size()) >= minibatch_size);
+
+      // Do Next() now, so we can test example_reader.Done() below.
+      example_reader.Next();
+      num_read++;
+
+      if (minibatch_ready || (example_reader.Done() && !examples.empty())) {
+        NnetExample merged_eg;
+        MergeExamples(examples, compress, &merged_eg);
+        std::ostringstream ostr;
+        ostr << "merged-" << num_written;
+        num_written++;
+        std::string output_key = ostr.str();
+        example_writer.Write(output_key, merged_eg);
+        // Reset the accumulators so the next minibatch starts empty.
+        examples.clear();
+        cur_num_output_frames = 0;
+      }
+    }
+    KALDI_LOG << "Merged " << num_read << " egs to " << num_written << '.';
+    return (num_written != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
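Typical invocations of the new binary, following its usage string and option docs; the second form sizes minibatches by input-example count rather than by output frames:

    nnet3-merge-egs --minibatch-size=512 ark:1.egs ark:- | nnet3-train-simple ...

    nnet3-merge-egs --measure-output-frames=false --minibatch-size=128 \
      ark:1.egs ark:merged.egs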