diff --git a/egs/hkust/s5/RESULTS b/egs/hkust/s5/RESULTS
index 9447f40dd39..674dcea38d1 100644
--- a/egs/hkust/s5/RESULTS
+++ b/egs/hkust/s5/RESULTS
@@ -7,3 +7,6 @@ exp/tri5a/decode/cer_13:%WER 49.67 [ 27891 / 56154, 2877 ins, 4538 del, 20476 su
 exp/tri5a_mce/decode/cer_11:%WER 44.74 [ 25125 / 56154, 2112 ins, 4108 del, 18905 sub ]
 exp/tri5a_mmi_b0.1/decode/cer_11:%WER 44.24 [ 24840 / 56154, 2060 ins, 4118 del, 18662 sub ]
 exp/tri5a_mpe/decode/cer_12:%WER 44.96 [ 25247 / 56154, 2233 ins, 4174 del, 18840 sub ]
+
+# ConvNet with 2 convolutional layers and 2 ReLU layers
+exp/nnet2_convnet/decode/cer_10:%WER 40.73 [ 22873 / 56154, 2609 ins, 3712 del, 16552 sub ]
diff --git a/egs/hkust/s5/local/nnet2/run_convnet.sh b/egs/hkust/s5/local/nnet2/run_convnet.sh
new file mode 100755
index 00000000000..f5baab0dc5d
--- /dev/null
+++ b/egs/hkust/s5/local/nnet2/run_convnet.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+# 2015  Xingyu Na
+# This script runs on the full training set, using ConvNet setup on top of
+# fbank features, on GPU.  The ConvNet has four hidden layers: two convolutional
+# layers and two affine transform layers with ReLU nonlinearity.
+# Convolutional layer [1]:
+#   convolution1d, input feature dim is 36, filter dim is 7, output dim is
+#   30, 128 filters are used
+#   maxpooling, 3-to-1 maxpooling, input dim is 30, output dim is 10
+# Convolutional layer [2]:
+#   convolution1d, input feature dim is 10, filter dim is 4, output dim is
+#   7, 256 filters are used
+# Affine transform layers [3-4]:
+#   affine transform with ReLU nonlinearity.
+
+temp_dir=
+dir=exp/nnet2_convnet
+stage=-5
+train_original=data/train
+train=data-fb/train
+
+. ./cmd.sh
+. ./path.sh
+
+. utils/parse_options.sh
+
+parallel_opts="--gpu 1"  # This is suitable for the CLSP network; you'll
+                         # likely have to change it.
+
+# Make the FBANK features
+if [ $stage -le -5 ]; then
+  # Dev set
+  utils/copy_data_dir.sh data/dev data-fb/dev || exit 1; rm data-fb/dev/{cmvn,feats}.scp
+  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
+    data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
+  steps/compute_cmvn_stats.sh data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
+  # Training set
+  utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp
+  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
+    $train $train/log $train/data || exit 1;
+  steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
+fi
+
+(
+  if [ ! -f $dir/final.mdl ]; then
+    steps/nnet2/train_convnet_accel2.sh --parallel-opts "$parallel_opts" \
+      --cmd "$decode_cmd" --stage $stage \
+      --num-threads 1 --minibatch-size 512 \
+      --mix-up 20000 --samples-per-iter 300000 \
+      --num-epochs 15 --delta-order 2 \
+      --initial-effective-lrate 0.0005 --final-effective-lrate 0.000025 \
+      --num-jobs-initial 3 --num-jobs-final 8 --splice-width 5 \
+      --hidden-dim 2000 --num-filters1 128 --patch-dim1 7 --pool-size 3 \
+      --num-filters2 256 --patch-dim2 4 \
+      $train data/lang exp/tri5a_ali $dir || exit 1;
+  fi
+
+  steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \
+    --config conf/decode.config \
+    exp/tri5a/graph data-fb/dev \
+    $dir/decode || exit 1;
+)
diff --git a/egs/wsj/s5/steps/nnet2/decode.sh b/egs/wsj/s5/steps/nnet2/decode.sh
index df8600df32b..753411f4563 100755
--- a/egs/wsj/s5/steps/nnet2/decode.sh
+++ b/egs/wsj/s5/steps/nnet2/decode.sh
@@ -84,7 +84,12 @@ fi
 splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
 
 case $feat_type in
-  raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";;
+  raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
+    if [ -f $srcdir/delta_order ]; then
+      delta_order=`cat $srcdir/delta_order 2>/dev/null`
+      feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
+    fi
+    ;;
   lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    ;;
   *) echo "$0: invalid feature type $feat_type" && exit 1;
diff --git a/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh b/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh
new file mode 100755
index 00000000000..1c34749ba7f
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh
@@ -0,0 +1,674 @@
+#!/bin/bash
+
+# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).
+#                2013  Xiaohui Zhang
+#                2013  Guoguo Chen
+#                2014  Vimal Manohar
+# Apache 2.0.
+
+# train_convnet_accel2.sh is modified from train_pnorm_accel2.sh.  It prototypes
+# the training of a ConvNet.  The ConvNet is composed of 4 layers.  The first layer
+# is a Convolutional1d component plus a Maxpooling component.  The second layer
+# is a single Convolutional1d component.  The third and fourth layers are affine
+# components with ReLU nonlinearities.  Due to the non-squashing output, a normalize
+# component is applied to all four layers.
+
+# train_pnorm_accel2.sh is a modified form of train_pnorm_simple2.sh (the "2"
+# suffix is because they both use the "new" egs format, created by
+# get_egs2.sh).  The "accel" part of the name refers to the fact that this
+# script uses a number of jobs that can increase during training.  You can
+# specify --num-jobs-initial and --num-jobs-final to control these separately.
+# Also, in this script, the learning rates specified by --initial-effective-lrate
+# and --final-effective-lrate are the "effective learning rates" (defined as the
+# learning rate divided by the number of jobs), and the actual learning rates
+# used will be the specified learning rates multiplied by the current number
+# of jobs.  You'll want to set these lower than you would previously have set
+# the learning rates, by a factor equal to the (previous) number of jobs.
+
+
+# Begin configuration section.
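+# (A worked example of the effective-learning-rate convention described above,
+# using the values that egs/hkust/s5/local/nnet2/run_convnet.sh passes to this
+# script: --initial-effective-lrate 0.0005 with --num-jobs-initial 3 gives an
+# actual starting learning rate of 0.0005 * 3 = 0.0015, and
+# --final-effective-lrate 0.000025 with --num-jobs-final 8 gives a final rate
+# of 0.000025 * 8 = 0.0002; this mirrors the initial_lrate computation done
+# further down in this script.)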
+cmd=run.pl
+num_epochs=15      # Number of epochs of training;
+                   # the number of iterations is worked out from this.
+initial_effective_lrate=0.01
+final_effective_lrate=0.001
+bias_stddev=0.5
+hidden_dim=3000
+minibatch_size=128 # by default use a smallish minibatch size for neural net
+                   # training; this controls instability which would otherwise
+                   # be a problem with multi-threaded update.
+
+samples_per_iter=400000 # each iteration of training, see this many samples
+                        # per job.  This option is passed to get_egs.sh
+num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training.
+num_jobs_final=8   # Number of jobs to run in parallel at the end of training.
+
+prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
+                        # more than enough.
+num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
+get_egs_stage=0
+online_ivector_dir=
+
+
+max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
+                      # to the final 'combine' stage, but these models will themselves be averages of
+                      # iteration-number ranges.
+
+shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
+                         # on each iter.  You could set it to 0 or to a large value for complete
+                         # randomization, but this would both consume memory and cause spikes in
+                         # disk I/O.  Smaller is easier on disk and memory but less random.  It's
+                         # not a huge deal though, as samples are anyway randomized right at the start.
+                         # (the point of this is to get data in different minibatches on different iterations,
+                         # since in the preconditioning method, 2 samples in the same minibatch can
+                         # affect each others' gradients.)
+
+add_layers_period=2 # by default, add new layers every 2 iterations.
+stage=-3
+
+splice_width=4 # meaning +- 4 frames on each side for second LDA
+left_context=  # if set, overrides splice-width
+right_context= # if set, overrides splice-width.
+randprune=4.0 # speeds up LDA.
+alpha=4.0 # relates to preconditioning.
+update_period=4 # relates to online preconditioning: says how often we update the subspace.
+num_samples_history=2000 # relates to online preconditioning
+max_change_per_sample=0.075
+precondition_rank_in=20  # relates to online preconditioning
+precondition_rank_out=80 # relates to online preconditioning
+
+num_filters1=128 # number of filters in the first convolutional layer
+patch_step1=1    # patch step of the first convolutional layer
+patch_dim1=7     # dim of convolutional kernel in the first layer
+pool_size=3      # size of pooling after the first convolutional layer
+num_filters2=256 # number of filters in the second convolutional layer
+patch_dim2=4     # dim of convolutional kernel in the second layer
+
+mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
+         # specified.)
+num_threads=16
+parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G"
+  # by default we use 16 threads; this lets the queue know.
+  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
+combine_num_threads=8
+combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage.
+cleanup=true
+egs_dir=
+lda_opts=
+lda_dim=
+egs_opts=
+delta_order=
+io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time.
+transform_dir=  # If supplied, overrides alidir
+postdir=
+cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
+            # only relevant for "raw" features, not lda.
+feat_type=  # Can be used to force "raw" features.
+align_cmd=     # The cmd that is passed to steps/nnet2/align.sh
+align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
+realign_times= # List of times on which we realign.  Each time is a
+               # floating point number strictly between 0 and 1, which
+               # will be multiplied by the num-iters to get an iteration
+               # number.
+num_jobs_align=30 # Number of jobs for realignment
+srand=0 # random seed used to initialize the nnet
+# End configuration section.
+
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 4 ]; then
+  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
+  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --config <config-file>                       # config file containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --num-epochs <#epochs|15>                    # Number of epochs of training"
+  echo "  --initial-effective-lrate                    # effective learning rate at start of training,"
+  echo "                                               # actual learning-rate is this times num-jobs."
+  echo "  --final-effective-lrate                      # effective learning rate at end of training."
+  echo "  --add-layers-period <#iters|2>               # Number of iterations between adding hidden layers"
+  echo "  --mix-up <#pseudo-gaussians|0>               # Can be used to have multiple targets in final output layer,"
+  echo "                                               # per context-dependent state.  Try a number several times #states."
+  echo "  --num-jobs-initial                           # Number of parallel jobs to use for neural net training, at the start."
+  echo "  --num-jobs-final                             # Number of parallel jobs to use for neural net training, at the end."
+  echo "  --num-threads                                # Number of parallel threads per job (will affect results"
+  echo "                                               # as well as speed; may interact with batch size; if you increase"
+  echo "                                               # this, you may want to decrease the batch size)."
+  echo "  --parallel-opts                              # extra options to pass to e.g. queue.pl for processes that"
+  echo "                                               # use multiple threads... note, you might have to reduce mem_free,ram_free"
+  echo "                                               # versus your defaults, because it gets multiplied by the -pe smp argument."
+  echo "  --io-opts                                    # Options given to e.g. queue.pl for jobs that do a lot of I/O."
+  echo "  --minibatch-size                             # Size of minibatch to process (note: product with --num-threads"
+  echo "                                               # should not get too large, e.g. >2k)."
+  echo "  --samples-per-iter <#samples|400000>         # Number of samples of data to process per iteration, per"
+  echo "                                               # process."
+  echo "  --splice-width                               # Number of frames on each side to append for feature input"
+  echo "                                               # (note: we splice processed, typically 40-dimensional frames)"
+  echo "  --realign-times                              # A space-separated list of times (fractions of the training run,"
+  echo "                                               # strictly between 0 and 1) at which realignment is to be done"
+  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
+  echo "  --align-use-gpu (yes/no)                     # specify if gpu is to be used for realignment"
+  echo "  --num-jobs-align <#njobs|30>                 # Number of jobs to perform realignment"
+  echo "  --stage                                      # Used to run a partially-completed training process from somewhere in"
+  echo "                                               # the middle."
+  echo "ConvNet configurations"
+  echo "  --num-filters1                               # number of filters in the first convolutional layer."
+  echo "  --patch-step1                                # patch step of the first convolutional layer."
+  echo "  --patch-dim1                                 # dim of convolutional kernel in the first layer."
+  echo "                                               # (note: (feat-dim - patch-dim1) % patch-step1 should be 0.)"
+  echo "  --pool-size                                  # size of pooling after the first convolutional layer."
+  echo "                                               # (note: (feat-dim - patch-dim1 + 1) % pool-size should be 0.)"
+  echo "  --num-filters2                               # number of filters in the second convolutional layer."
+  echo "  --patch-dim2                                 # dim of convolutional kernel in the second layer."
+
+
+  exit 1;
+fi
+
+data=$1
+lang=$2
+alidir=$3
+dir=$4
+
+if [ ! -z "$realign_times" ]; then
+  [ -z "$align_cmd" ] && echo "$0: realign_times specified but align_cmd not specified" && exit 1
+  [ -z "$align_use_gpu" ] && echo "$0: realign_times specified but align_use_gpu not specified" && exit 1
+fi
+
+# Check some files.
+for f in $data/feats.scp $lang/L.fst $alidir/final.mdl $alidir/tree; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+[ ! -f $postdir/post.1.scp ] && [ ! -f $alidir/ali.1.gz ] && echo "$0: no (soft) alignments provided" && exit 1;
+
+trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
+
+# Set some variables.
+num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1
+[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1
+[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1
+
+nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
+# in this dir we'll have just one job.
+sdata=$data/split$nj
+utils/split_data.sh $data $nj
+
+mkdir -p $dir/log
+echo $nj > $dir/num_jobs
+cp $alidir/tree $dir
+
+extra_opts=()
+[ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
+[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
+[ ! -z "$delta_order" ] && extra_opts+=(--delta-order $delta_order)
+[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
+[ -z "$transform_dir" ] && transform_dir=$alidir
+extra_opts+=(--transform-dir $transform_dir)
+[ -z "$left_context" ] && left_context=$splice_width
+[ -z "$right_context" ] && right_context=$splice_width
+extra_opts+=(--left-context $left_context --right-context $right_context)
+
+feat-to-dim scp:$sdata/1/feats.scp - > $dir/feat_dim
+feat_dim=$(cat $dir/feat_dim) || exit 1;
+
+if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
+  echo "$0: calling get_egs2.sh"
+  steps/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}" --io-opts "$io_opts" \
+      --postdir "$postdir" --samples-per-iter $samples_per_iter --stage $get_egs_stage \
+      --cmd "$cmd" --feat-type "raw" $data $alidir $dir/egs || exit 1;
+fi
+
+if [ -f $dir/egs/cmvn_opts ]; then
+  cp $dir/egs/cmvn_opts $dir
+fi
+
+if [ -f $dir/egs/delta_order ]; then
+  cp $dir/egs/delta_order $dir
+fi
+
+if [ -z $egs_dir ]; then
+  egs_dir=$dir/egs
+fi
+
+frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
+num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }
+
+# num_archives_expanded considers each separate label-position from
+# 0..frames_per_eg-1 to be a separate archive.
+num_archives_expanded=$[$num_archives*$frames_per_eg]
+
+[ $num_jobs_initial -gt $num_jobs_final ] && \
+  echo "$0: --num-jobs-initial cannot exceed --num-jobs-final" && exit 1;
+
+[ $num_jobs_final -gt $num_archives_expanded ] && \
+  echo "$0: --num-jobs-final cannot exceed #archives $num_archives_expanded." && exit 1;
+
+if ! [ $num_hidden_layers -ge 1 ]; then
+  echo "Invalid num-hidden-layers $num_hidden_layers"
+  exit 1
+fi
+
+if [ $stage -le -2 ]; then
+  echo "$0: initializing neural net";
+  tot_splice=$[($delta_order+1)*($left_context+1+$right_context)]
+  delta_feat_dim=$[($delta_order+1)*$feat_dim]
+  tot_input_dim=$[$feat_dim*$tot_splice]
+  num_patch1=$[1+($feat_dim-$patch_dim1)/$patch_step1]
+  num_pool=$[$num_patch1/$pool_size]
+  patch_dim2=$[$patch_dim2*$num_filters1]
+  patch_step2=$num_filters1
+  patch_stride2=$[$num_pool*$num_filters1]  # same as pool outputs
+  num_patch2=$[1+($num_pool*$num_filters1-$patch_dim2)/$patch_step2]
+  conv_out_dim1=$[$num_filters1*$num_patch1]  # 128 x (36 - 7 + 1)
+  pool_out_dim=$[$num_filters1*$num_pool]
+  conv_out_dim2=$[$num_filters2*$num_patch2]
+
+  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"
+
+  initial_lrate=$(perl -e "print ($initial_effective_lrate*$num_jobs_initial);")
+  stddev=`perl -e "print 1.0/sqrt($hidden_dim);"`
+  cat >$dir/nnet.config <$dir/replace.1.config <$dir/replace.2.config <
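The geometry that the initialization stage above works out can be checked by hand. The following standalone sketch is not part of the patch; it simply mirrors the shell arithmetic of train_convnet_accel2.sh for the values the hkust recipe passes in (36-dimensional fbank input, filter dims 7 and 4, pool size 3, 128 and 256 filters), reproducing the 36 -> 30 -> 10 -> 7 dimensions quoted in the run_convnet.sh header:

#!/bin/bash
# Illustrative check of the ConvNet geometry (not part of the patch).
feat_dim=36        # fbank dim assumed by run_convnet.sh
patch_dim1=7; patch_step1=1; pool_size=3
num_filters1=128; patch_dim2=4; num_filters2=256

num_patch1=$[1+($feat_dim-$patch_dim1)/$patch_step1]   # 1 + (36-7)/1 = 30
num_pool=$[$num_patch1/$pool_size]                     # 30 / 3 = 10
pool_out_dim=$[$num_filters1*$num_pool]                # 128 * 10 = 1280

# The second convolution runs over the pooled maps; its patch dim and step are
# scaled by num_filters1, as in the initialization stage above.
patch_dim2_full=$[$patch_dim2*$num_filters1]           # 4 * 128 = 512
patch_step2=$num_filters1                              # 128
num_patch2=$[1+($num_pool*$num_filters1-$patch_dim2_full)/$patch_step2]  # 1 + (1280-512)/128 = 7
conv_out_dim2=$[$num_filters2*$num_patch2]             # 256 * 7 = 1792

echo "conv1 patches: $num_patch1  pooled: $num_pool  conv2 patches: $num_patch2"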