From 6548b55a7a3bde33654170793b7610ec2c066b71 Mon Sep 17 00:00:00 2001
From: Gaofeng Cheng <770579626@qq.com>
Date: Fri, 16 Dec 2016 16:06:58 +0800
Subject: [PATCH 01/12] add dropout by row

---
 src/nnet3/nnet-simple-component.cc | 56 ++++++++++++++++++++++++------
 src/nnet3/nnet-simple-component.h  | 16 +++++----
 src/nnet3/nnet-utils.cc            |  3 +-
 src/nnet3/nnet-utils.h             |  2 +-
 4 files changed, 59 insertions(+), 18 deletions(-)

diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index 58908a0fe09..7d2fa977be1 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -87,27 +87,39 @@ void PnormComponent::Write(std::ostream &os, bool binary) const {
 }
 
 
-void DropoutComponent::Init(int32 dim, BaseFloat dropout_proportion) {
+void DropoutComponent::Init(int32 dim, BaseFloat dropout_proportion, bool dropout_per_frame) {
   dropout_proportion_ = dropout_proportion;
+  dropout_per_frame_ = dropout_per_frame;
   dim_ = dim;
 }
 
 void DropoutComponent::InitFromConfig(ConfigLine *cfl) {
   int32 dim = 0;
   BaseFloat dropout_proportion = 0.0;
+  bool dropout_per_frame = false;
   bool ok = cfl->GetValue("dim", &dim) &&
       cfl->GetValue("dropout-proportion", &dropout_proportion);
+  bool ok2 = cfl->GetValue("dropout-per-frame", &dropout_per_frame);
   if (!ok || cfl->HasUnusedValues() || dim <= 0 ||
       dropout_proportion < 0.0 || dropout_proportion > 1.0)
     KALDI_ERR << "Invalid initializer for layer of type "
               << Type() << ": \"" << cfl->WholeLine() << "\"";
-  Init(dim, dropout_proportion);
+  if( ! ok2 )
+  {
+    dropout_per_frame = false;
+    Init(dim, dropout_proportion, dropout_per_frame);
+  }
+  else
+  {
+    Init(dim, dropout_proportion, dropout_per_frame);
+  }
 }
 
 std::string DropoutComponent::Info() const {
   std::ostringstream stream;
   stream << Type() << ", dim=" << dim_
-         << ", dropout-proportion=" << dropout_proportion_;
+         << ", dropout-proportion=" << dropout_proportion_
+         << ", dropout-per-frame=" << dropout_per_frame_;
   return stream.str();
 }
 
@@ -119,16 +131,36 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
   BaseFloat dropout = dropout_proportion_;
   KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0);
 
+  if(dropout_per_frame_ == true)
+  {
+    // This const_cast is only safe assuming you don't attempt
+    // to use multi-threaded code with the GPU.
+    const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
 
-  // This const_cast is only safe assuming you don't attempt
-  // to use multi-threaded code with the GPU.
-  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
+    out->Add(-dropout); // now, a proportion "dropout" will be <0.0
+    out->ApplyHeaviside(); // apply the function (x>0?1:0).  Now, a proportion "dropout" will
+                           // be zero and (1 - dropout) will be 1.0.
 
-  out->Add(-dropout); // now, a proportion "dropout" will be <0.0
-  out->ApplyHeaviside(); // apply the function (x>0?1:0).  Now, a proportion "dropout" will
-                         // be zero and (1 - dropout) will be 1.0.
+    out->MulElements(in);
+  }
+  else
+  {
 
-  out->MulElements(in);
+    // This const_cast is only safe assuming you don't attempt
+    // to use multi-threaded code with the GPU.
+    const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
+    out->Add(-dropout); // now, a proportion "dropout" will be <0.0
+    out->ApplyHeaviside(); // apply the function (x>0?1:0).  Now, a proportion "dropout" will
+                           // be zero and (1 - dropout) will be 1.0.
+    CuVector<BaseFloat> *random_drop_vector = new CuVector<BaseFloat>(in.NumRows(), kSetZero);
+    MatrixIndexT i = 0;
+    random_drop_vector->CopyColFromMat(*out, i);
+    for (MatrixIndexT i = 0; i < in.NumCols(); i++)
+    {
+      out->CopyColFromVec(*random_drop_vector, i);
+    }
+    out->MulElements(in);
+  }
 }
 
 
@@ -154,6 +186,8 @@ void DropoutComponent::Read(std::istream &is, bool binary) {
   ReadBasicType(is, binary, &dim_);
   ExpectToken(is, binary, "<DropoutProportion>");
   ReadBasicType(is, binary, &dropout_proportion_);
+  ExpectToken(is, binary, "<DropoutPerFrame>");
+  ReadBasicType(is, binary, &dropout_per_frame_);
   ExpectToken(is, binary, "</DropoutComponent>");
 }
 
@@ -163,6 +197,8 @@ void DropoutComponent::Write(std::ostream &os, bool binary) const {
   WriteBasicType(os, binary, dim_);
   WriteToken(os, binary, "<DropoutProportion>");
   WriteBasicType(os, binary, dropout_proportion_);
+  WriteToken(os, binary, "<DropoutPerFrame>");
+  WriteBasicType(os, binary, dropout_per_frame_);
   WriteToken(os, binary, "</DropoutComponent>");
 }
 
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index f09a989759a..b9e86760a46 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -87,11 +87,11 @@ class PnormComponent: public Component {
 // "Dropout: A Simple Way to Prevent Neural Networks from Overfitting".
 class DropoutComponent : public RandomComponent {
  public:
-  void Init(int32 dim, BaseFloat dropout_proportion = 0.0);
+  void Init(int32 dim, BaseFloat dropout_proportion = 0.0, bool dropout_per_frame = false);
 
-  DropoutComponent(int32 dim, BaseFloat dropout = 0.0) { Init(dim, dropout); }
+  DropoutComponent(int32 dim, BaseFloat dropout = 0.0, bool dropout_per_frame = false) { Init(dim, dropout, dropout_per_frame); }
 
-  DropoutComponent(): dim_(0), dropout_proportion_(0.0) { }
+  DropoutComponent(): dim_(0), dropout_proportion_(0.0), dropout_per_frame_(false) { }
 
   virtual int32 Properties() const {
     return kLinearInInput|kBackpropInPlace|kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput;
@@ -120,17 +120,21 @@ class DropoutComponent : public RandomComponent {
                         Component *to_update,
                         CuMatrixBase<BaseFloat> *in_deriv) const;
   virtual Component* Copy() const { return new DropoutComponent(dim_,
-                                                                dropout_proportion_); }
+                                                                dropout_proportion_,
+                                                                dropout_per_frame_); }
   virtual std::string Info() const;
 
-  void SetDropoutProportion(BaseFloat dropout_proportion) { dropout_proportion_ = dropout_proportion; }
+  void SetDropoutProportion(BaseFloat dropout_proportion, bool dropout_per_frame) {
+    dropout_proportion_ = dropout_proportion;
+    dropout_per_frame_ = dropout_per_frame;
+  }
 
  private:
   int32 dim_;
   /// dropout-proportion is the proportion that is dropped out,
   /// e.g. if 0.1, we set 10% to zero value.
   BaseFloat dropout_proportion_;
-
+  bool dropout_per_frame_;
 };
 
 class ElementwiseProductComponent: public Component {
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index 55e0f8fcf6a..da02c2965d1 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -524,12 +524,13 @@ std::string NnetInfo(const Nnet &nnet) {
 }
 
 void SetDropoutProportion(BaseFloat dropout_proportion,
+                          bool dropout_per_frame ,
                           Nnet *nnet) {
   for (int32 c = 0; c < nnet->NumComponents(); c++) {
     Component *comp = nnet->GetComponent(c);
     DropoutComponent *dc = dynamic_cast<DropoutComponent*>(comp);
     if (dc != NULL)
-      dc->SetDropoutProportion(dropout_proportion);
+      dc->SetDropoutProportion(dropout_proportion, dropout_per_frame);
   }
 }
 
diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h
index fb304803d1e..ef5be32270a 100644
--- a/src/nnet3/nnet-utils.h
+++ b/src/nnet3/nnet-utils.h
@@ -182,7 +182,7 @@ std::string NnetInfo(const Nnet &nnet);
 
 /// This function sets the dropout proportion in all dropout component to
 /// dropout_proportion value.
-void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet);
+void SetDropoutProportion(BaseFloat dropout_proportion, bool dropout_per_frame, Nnet *nnet);
 
 /// This function finds a list of components that are never used, and outputs
 /// the integer comopnent indexes (you can use these to index

From 23ae7303925820e7569aedfc252b3ad0c65371d7 Mon Sep 17 00:00:00 2001
From: Gaofeng Cheng <770579626@qq.com>
Date: Fri, 16 Dec 2016 16:29:15 +0800
Subject: [PATCH 02/12] now only support by row dropout

---
 src/nnet3/nnet-utils.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index da02c2965d1..eff71f7f268 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -526,6 +526,7 @@ std::string NnetInfo(const Nnet &nnet) {
 void SetDropoutProportion(BaseFloat dropout_proportion,
                           bool dropout_per_frame ,
                           Nnet *nnet) {
+  bool dropout_per_frame = false;
   for (int32 c = 0; c < nnet->NumComponents(); c++) {
     Component *comp = nnet->GetComponent(c);
     DropoutComponent *dc = dynamic_cast<DropoutComponent*>(comp);

From 614a868e42c7c49ba62cd8b09cd57a68c1fe15be Mon Sep 17 00:00:00 2001
From: Gaofeng Cheng <770579626@qq.com>
Date: Fri, 16 Dec 2016 17:42:14 +0800
Subject: [PATCH 03/12] revise

---
 src/nnet3/nnet-chain-combine.cc | 2 +-
 src/nnet3/nnet-combine.cc       | 2 +-
 src/nnet3/nnet-utils.cc         | 8 ++++++--
 src/nnet3/nnet-utils.h          | 2 +-
 src/nnet3bin/nnet3-combine.cc   | 2 +-
 5 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc
index dd9b99fe26d..048fe1f449c 100644
--- a/src/nnet3/nnet-chain-combine.cc
+++ b/src/nnet3/nnet-chain-combine.cc
@@ -38,7 +38,7 @@ NnetChainCombiner::NnetChainCombiner(const NnetCombineConfig &combine_config,
     nnet_params_(std::min(num_nnets, combine_config_.max_effective_inputs),
                  NumParameters(first_nnet)),
     tot_input_weighting_(nnet_params_.NumRows()) {
-  SetDropoutProportion(0, &nnet_);
+  SetDropoutProportion(0, false, &nnet_);
   SubVector<BaseFloat> first_params(nnet_params_, 0);
   VectorizeNnet(nnet_, &first_params);
   tot_input_weighting_(0) += 1.0;
diff --git a/src/nnet3/nnet-combine.cc b/src/nnet3/nnet-combine.cc
index 07a96d143c2..7501c9c84dd 100644
--- a/src/nnet3/nnet-combine.cc
+++ b/src/nnet3/nnet-combine.cc
@@ -34,7 +34,7 @@ NnetCombiner::NnetCombiner(const NnetCombineConfig &config,
     nnet_params_(std::min(num_nnets, config_.max_effective_inputs),
                  NumParameters(first_nnet)),
     tot_input_weighting_(nnet_params_.NumRows()) {
-  SetDropoutProportion(0, &nnet_);
+  SetDropoutProportion(0, false, &nnet_);
   SubVector<BaseFloat> first_params(nnet_params_, 0);
   VectorizeNnet(nnet_, &first_params);
   tot_input_weighting_(0) += 1.0;
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index eff71f7f268..216ee955d75 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -526,7 +526,7 @@ std::string NnetInfo(const Nnet &nnet) {
 void SetDropoutProportion(BaseFloat dropout_proportion,
                           bool dropout_per_frame ,
                           Nnet *nnet) {
-  bool dropout_per_frame = false;
+  dropout_per_frame = false;
   for (int32 c = 0; c < nnet->NumComponents(); c++) {
     Component *comp = nnet->GetComponent(c);
     DropoutComponent *dc = dynamic_cast<DropoutComponent*>(comp);
@@ -696,10 +696,14 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) {
       // matches names of components, not nodes.
       config_line.GetValue("name", &name_pattern);
       BaseFloat proportion = -1;
+      bool perframe = false;
       if (!config_line.GetValue("proportion", &proportion)) {
         KALDI_ERR << "In edits-config, expected proportion to be set in line: "
                   << config_line.WholeLine();
       }
+      if (!config_line.GetValue("perframe", &perframe)) {
+        perframe = false;
+      }
       DropoutComponent *component = NULL;
       int32 num_dropout_proportions_set = 0;
       for (int32 c = 0; c < nnet->NumComponents(); c++) {
         if (NameMatchesPattern(nnet->GetComponentName(c).c_str(),
                                name_pattern.c_str()) &&
             (component =
              dynamic_cast<DropoutComponent*>(nnet->GetComponent(c)))) {
-          component->SetDropoutProportion(proportion);
+          component->SetDropoutProportion(proportion, perframe);
           num_dropout_proportions_set++;
         }
       }
diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h
index ef5be32270a..ebb81b8732f 100644
--- a/src/nnet3/nnet-utils.h
+++ b/src/nnet3/nnet-utils.h
@@ -233,7 +233,7 @@ void FindOrphanNodes(const Nnet &nnet, std::vector<int32> *nodes);
       remove internal nodes directly; instead you should use the command
       'remove-orphans'.
 
-   set-dropout-proportion [name=<name-pattern>] proportion=<dropout-proportion>
+   set-dropout-proportion [name=<name-pattern>] proportion=<dropout-proportion> perframe=<true|false>
      Sets the dropout rates for any components of type DropoutComponent whose
      names match the given <name-pattern> (e.g. lstm*).  <name-pattern> defaults to "*".
\endverbatim diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index 5abc317f054..ee6bfffdac4 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -104,7 +104,7 @@ int main(int argc, char *argv[]) { } else { KALDI_LOG << "Copying the single input model directly to the output, " << "without any combination."; - SetDropoutProportion(0, &nnet); + SetDropoutProportion(0, false, &nnet); WriteKaldiObject(nnet, nnet_wxfilename, binary_write); } KALDI_LOG << "Finished combining neural nets, wrote model to " From c1d1ad112c482052d309c76f1892c73b332af9ab Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Sat, 17 Dec 2016 18:04:09 +0800 Subject: [PATCH 04/12] adding scripts level dropout-by-row code and fix some issues --- .../s5b/local/chain/tuning/run_tdnn_lstm_1i_dp.sh | 8 +++++--- .../libs/nnet3/train/chain_objf/acoustic_model.py | 3 ++- egs/wsj/s5/steps/libs/nnet3/train/common.py | 15 ++++++++++----- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 8 +++++++- egs/wsj/s5/steps/nnet3/chain/train.py | 8 +++++++- src/nnet3/nnet-simple-component.cc | 10 +++------- src/nnet3/nnet-utils.cc | 10 +++++----- src/nnet3/nnet-utils.h | 2 +- 8 files changed, 40 insertions(+), 24 deletions(-) diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i_dp.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i_dp.sh index ea7c01b79ec..16e2e4b7bcf 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i_dp.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i_dp.sh @@ -29,6 +29,7 @@ ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). num_threads_ubm=32 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned dropout_schedule='0,0@0.20,0.5@0.50,0@0.50,0' +dropout_per_frame=false chunk_width=150 chunk_left_context=40 chunk_right_context=0 @@ -193,15 +194,15 @@ if [ $stage -le 15 ]; then relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults - lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=false relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 - lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=false relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 - lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=false ## adding the layers for chain branch output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 @@ -243,6 +244,7 @@ if [ $stage -le 16 ]; then --egs.chunk-left-context $chunk_left_context \ --egs.chunk-right-context 
$chunk_right_context \ --trainer.dropout-schedule $dropout_schedule \ + --trainer.dropout-per-frame $dropout_per_frame \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index bcc876832dd..e1109fea166 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -225,6 +225,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, frame_subsampling_factor, truncate_deriv_weights, run_opts, dropout_proportions=None, + dropout_per_frame=None, background_process_handler=None): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -307,7 +308,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, dropout_info_str = '' if dropout_proportions is not None: raw_model_string, dropout_info = common_train_lib.apply_dropout( - dropout_proportions, raw_model_string) + dropout_proportions, dropout_per_frame, raw_model_string) dropout_info_str = ', {0}'.format(", ".join(dropout_info)) shrink_info_str = '' diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 6d255186cf4..952d64cab4c 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -511,7 +511,7 @@ def _get_component_dropout(dropout_schedule, num_archives_processed): + initial_dropout) -def apply_dropout(dropout_proportions, raw_model_string): +def apply_dropout(dropout_proportions, dropout_per_frame, raw_model_string): """Adds an nnet3-copy --edits line to modify raw_model_string to set dropout proportions according to dropout_proportions. @@ -523,10 +523,10 @@ def apply_dropout(dropout_proportions, raw_model_string): for component_name, dropout_proportion in dropout_proportions: edit_config_lines.append( - "set-dropout-proportion name={0} proportion={1}".format( - component_name, dropout_proportion)) - dropout_info.append("pattern/dropout-proportion={0}/{1}".format( - component_name, dropout_proportion)) + "set-dropout-proportion name={0} proportion={1} dropout-per-frame={2}".format( + component_name, dropout_proportion, dropout_per_frame)) + dropout_info.append("pattern/dropout-proportion={0}/{1} dropout-per-frame={2}".format( + component_name, dropout_proportion, dropout_per_frame)) return ("""{raw_model_string} nnet3-copy --edits='{edits}' \ - - |""".format(raw_model_string=raw_model_string, @@ -771,6 +771,11 @@ def __init__(self): lstm*=0,0.2,0'. 
More general should precede less general patterns, as they are applied sequentially.""") + self.parser.add_argument("--trainer.dropout-per-frame", type=str, + action=common_lib.NullstrToNoneAction, + dest='dropout_per_frame', default=None, + help="""this option is used to control whether + using dropout by frame level or by vector level""") # General options self.parser.add_argument("--stage", type=int, default=-4, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 023eece93da..bc56be8e8f1 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -251,6 +251,7 @@ def set_default_configs(self): 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, 'dropout-proportion' : -1.0 # -1.0 stands for no dropout will be added + 'dropout-per-frame' : 'false' } def set_derived_configs(self): @@ -285,6 +286,10 @@ def check_configs(self): self.config['dropout-proportion'] < 0.0) and self.config['dropout-proportion'] != -1.0 ): raise xparser_error("dropout-proportion has invalid value {0}.".format(self.config['dropout-proportion'])) + + if (self.config['dropout-per-frame'] != 'false' or + self.config['dropout-per-frame'] != 'true'): + raise xparser_error("dropout-per-frame has invalid value {0}.".format(self.config['dropout-per-frame'])) def auxiliary_outputs(self): return ['c_t'] @@ -347,7 +352,8 @@ def generate_lstm_config(self): pes_str = self.config['ng-per-element-scale-options'] lstm_dropout_value = self.config['dropout-proportion'] lstm_dropout_str = 'dropout-proportion='+str(self.config['dropout-proportion']) - + lstm_dropout_per_frame_value = self.config['dropout-per-frame'] + lstm_dropout_per_frame_str = 'dropout-per-frame='+str(self.config['dropout-per-frame']) # Natural gradient per element scale parameters # TODO: decide if we want to keep exposing these options if re.search('param-mean', pes_str) is None and \ diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 7aabf02e86b..9d497b872b3 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -202,7 +202,10 @@ def process_args(args): "value={0}. 
We recommend using the option "
                        "--trainer.deriv-truncate-margin.".format(
                            args.deriv_truncate_margin))
-
+    if ( args.dropout_schedule is None )
+        and (args.dropout_per_frame is not None) :
+        raise Exception("The dropout schedule is null, but dropout_per_frame"
+                        "option is not null")
     if (not os.path.exists(args.dir)
             or not os.path.exists(args.dir+"/configs")):
         raise Exception("This scripts expects {0} to exist and have a configs "
@@ -441,6 +444,9 @@ def learning_rate(iter, current_num_jobs, num_archives_processed):
                     None if args.dropout_schedule is None
                     else common_train_lib.get_dropout_proportions(
                         dropout_schedule, num_archives_processed)),
+                dropout_per_frame=(
+                    None if args.dropout_schedule is None
+                    else args.dropout_per_frame),
                 shrinkage_value=shrinkage_value,
                 num_chunk_per_minibatch=args.num_chunk_per_minibatch,
                 num_hidden_layers=num_hidden_layers,
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index 7d2fa977be1..722898f7eaf 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -108,9 +108,7 @@ void DropoutComponent::InitFromConfig(ConfigLine *cfl) {
   {
     dropout_per_frame = false;
     Init(dim, dropout_proportion, dropout_per_frame);
-  }
-  else
-  {
+  } else {
     Init(dim, dropout_proportion, dropout_per_frame);
   }
 }
@@ -131,7 +129,7 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
   BaseFloat dropout = dropout_proportion_;
   KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0);
 
-  if(dropout_per_frame_ == true)
-  {
+  if(dropout_per_frame_) {
     // This const_cast is only safe assuming you don't attempt
     // to use multi-threaded code with the GPU.
@@ -142,9 +140,7 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
     out->MulElements(in);
-  }
-  else
-  {
+  } else {
     // This const_cast is only safe assuming you don't attempt
     // to use multi-threaded code with the GPU.
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index 216ee955d75..973dea3b913 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -524,7 +524,7 @@ std::string NnetInfo(const Nnet &nnet) {
 }
 
 void SetDropoutProportion(BaseFloat dropout_proportion,
-                          bool dropout_per_frame ,
+                          bool dropout_per_frame,
                           Nnet *nnet) {
   dropout_per_frame = false;
   for (int32 c = 0; c < nnet->NumComponents(); c++) {
     Component *comp = nnet->GetComponent(c);
     DropoutComponent *dc = dynamic_cast<DropoutComponent*>(comp);
@@ -696,13 +696,13 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) {
       // matches names of components, not nodes.
      config_line.GetValue("name", &name_pattern);
       BaseFloat proportion = -1;
-      bool perframe = false;
+      bool dropout_per_frame = false;
       if (!config_line.GetValue("proportion", &proportion)) {
         KALDI_ERR << "In edits-config, expected proportion to be set in line: "
                   << config_line.WholeLine();
       }
-      if (!config_line.GetValue("perframe", &perframe)) {
-        perframe = false;
+      if (!config_line.GetValue("dropout-per-frame", &dropout_per_frame)) {
+        dropout_per_frame = false;
       }
       DropoutComponent *component = NULL;
       int32 num_dropout_proportions_set = 0;
       for (int32 c = 0; c < nnet->NumComponents(); c++) {
         if (NameMatchesPattern(nnet->GetComponentName(c).c_str(),
                                name_pattern.c_str()) &&
             (component =
              dynamic_cast<DropoutComponent*>(nnet->GetComponent(c)))) {
-          component->SetDropoutProportion(proportion, perframe);
+          component->SetDropoutProportion(proportion, dropout_per_frame);
           num_dropout_proportions_set++;
         }
       }
diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h
index ebb81b8732f..1d186cc0600 100644
--- a/src/nnet3/nnet-utils.h
+++ b/src/nnet3/nnet-utils.h
@@ -233,7 +233,7 @@ void FindOrphanNodes(const Nnet &nnet, std::vector<int32> *nodes);
       remove internal nodes directly; instead you should use the command
       'remove-orphans'.
 
-   set-dropout-proportion [name=<name-pattern>] proportion=<dropout-proportion> perframe=<true|false>
+   set-dropout-proportion [name=<name-pattern>] proportion=<dropout-proportion> dropout-per-frame=<true|false>
      Sets the dropout rates for any components of type DropoutComponent whose
      names match the given <name-pattern> (e.g. lstm*).  <name-pattern> defaults to "*".
  \endverbatim

From 14662b65204f9551f9fad031955055cdcb0643db Mon Sep 17 00:00:00 2001
From: Gaofeng Cheng <770579626@qq.com>
Date: Sat, 17 Dec 2016 21:12:56 +0800
Subject: [PATCH 05/12] adding kernel heavybyrow

---
 src/cudamatrix/cu-kernels.cu       | 21 +++++++++++++++++++++
 src/cudamatrix/cu-kernels.h        |  3 +++
 src/cudamatrix/cu-matrix.cc        | 17 +++++++++++++++++
 src/cudamatrix/cu-matrix.h         |  1 +
 src/nnet3/nnet-simple-component.cc | 11 ++---------
 5 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 795b4321413..e06dfbe56ac 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -1628,6 +1628,23 @@ static void _apply_heaviside(Real* mat, MatrixDim d) {
     mat[index] = (mat[index] > 0.0 ? 1.0 : 0.0);
 }
 
+template<typename Real>
+__global__
+static void _apply_heaviside_by_row(Real* mat, MatrixDim d) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;  // col index
+  int j = blockIdx.y * blockDim.y + threadIdx.y;  // row index
+  int j_tempt = blockIdx.y * blockDim.y + threadIdx.y;  // row index using to control setting heavyside() in the first rows
+  int index = i + j * d.stride;
+  if (i < d.cols && j < d.rows)
+    if (j = j_ref) {
+      mat[index] = (mat[index] > 0.0 ?
+                    1.0 : 0.0);
+    }
+    else {
+      mat[index] = mat[index-d.stride-d.cols]
+    }
+}
+
+
 template<typename Real>
 __global__
 static void _apply_floor(Real* mat, Real floor_val, MatrixDim d) {
@@ -3233,6 +3250,10 @@ void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
   _apply_heaviside<<<Gr, Bl>>>(mat, d);
 }
 
+void cudaF_apply_heaviside_by_row(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
+  _apply_heaviside_by_row<<<Gr, Bl>>>(mat, d);
+}
+
 void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src,
                      const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
                      int src_stride) {
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 9e9910d6f56..71493ad8bd6 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -201,6 +201,9 @@ inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power,
 inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) {
   cudaF_apply_heaviside(Gr, Bl, mat, dim);
 }
+inline void cuda_apply_heaviside_by_row(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) {
+  cudaF_apply_heaviside_by_row(Gr, Bl, mat, dim);
+}
 inline void cuda_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val,
                              MatrixDim dim) {
   cudaF_apply_floor(Gr, Bl, mat, floor_val, dim);
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index f16b7f0bf52..499949c6bcb 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -2207,6 +2207,23 @@ void CuMatrixBase<Real>::ApplyHeaviside() {
   }
 }
 
+template<typename Real>
+void CuMatrixBase<Real>::ApplyHeavisideByRow() {
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+    dim3 dimGrid, dimBlock;
+    GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
+                                          &dimGrid, &dimBlock);
+    cuda_apply_heaviside_by_row(dimGrid, dimBlock, data_, Dim());
+    CU_SAFE_CALL(cudaGetLastError());
+    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
+  } else
+#endif
+  {
+    KALDI_ERR << "no ApplyHeavisideByRow implemented without CUDA";
+  }
+}
 template<typename Real>
 void CuMatrixBase<Real>::Heaviside(const CuMatrixBase<Real> &src) {
   KALDI_ASSERT(SameDim(*this, src));
diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index 38a6c25071b..efac59b5aaf 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -369,6 +369,7 @@ class CuMatrixBase {
   /// For each element, sets x = (x > 0 ? 1.0 : 0.0).
   /// See also Heaviside().
   void ApplyHeaviside();
+  void ApplyHeavisideByRow();
   void ApplyFloor(Real floor_val);
   void ApplyCeiling(Real ceiling_val);
   void ApplyExp();
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index 722898f7eaf..c79af655715 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -146,15 +146,8 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
     // to use multi-threaded code with the GPU.
     const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
     out->Add(-dropout); // now, a proportion "dropout" will be <0.0
-    out->ApplyHeaviside(); // apply the function (x>0?1:0).  Now, a proportion "dropout" will
-                           // be zero and (1 - dropout) will be 1.0.
-    CuVector<BaseFloat> *random_drop_vector = new CuVector<BaseFloat>(in.NumRows(), kSetZero);
-    MatrixIndexT i = 0;
-    random_drop_vector->CopyColFromMat(*out, i);
-    for (MatrixIndexT i = 0; i < in.NumCols(); i++)
-    {
-      out->CopyColFromVec(*random_drop_vector, i);
-    }
+    out->ApplyHeavisideByRow(); // apply the function (x>0?1:0).  Now, a proportion "dropout" will
+                                // be zero and (1 - dropout) will be 1.0 by row.
out->MulElements(in); } } From 1d22219c09989a096dc0e45e7a29edd781b7ea8a Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Sat, 17 Dec 2016 21:49:31 +0800 Subject: [PATCH 06/12] add cuda kernel to realize random-matrix-by row --- src/cudamatrix/cu-kernels-ansi.h | 2 ++ src/cudamatrix/cu-kernels.cu | 8 ++++++-- src/cudamatrix/cu-kernels.h | 3 +++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 878ba216407..554837049e3 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -64,6 +64,7 @@ void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d); void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include_sign, MatrixDim d); void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaF_apply_heaviside_by_row(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d); void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, @@ -330,6 +331,7 @@ void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d); void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool include_sign, MatrixDim d); void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); +void cudaD_apply_heaviside_by_row(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d); void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index e06dfbe56ac..98e03b02b37 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -1636,11 +1636,11 @@ static void _apply_heaviside_by_row(Real* mat, MatrixDim d) { int j_tempt = blockIdx.y * blockDim.y + threadIdx.y; // row index using to control setting heavyside() in the first rows int index = i + j * d.stride; if (i < d.cols && j < d.rows) - if (j = j_ref) { + if (j = j_tempt) { mat[index] = (mat[index] > 0.0 ? 
                    1.0 : 0.0);
     }
     else {
-      mat[index] = mat[index-d.stride-d.cols]
+      mat[index] = mat[index-d.stride-d.cols];
     }
 }
 
@@ -3901,6 +3901,10 @@ void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
   _apply_heaviside<<<Gr, Bl>>>(mat, d);
 }
 
+void cudaD_apply_heaviside_by_row(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
+  _apply_heaviside_by_row<<<Gr, Bl>>>(mat, d);
+}
+
 void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src,
                      const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
                      int src_stride) {
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 71493ad8bd6..58432f290f7 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -742,6 +742,9 @@ inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power,
 inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) {
   cudaD_apply_heaviside(Gr, Bl, mat, dim);
 }
+inline void cuda_apply_heaviside_by_row(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) {
+  cudaD_apply_heaviside_by_row(Gr, Bl, mat, dim);
+}
 inline void cuda_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val,
                              MatrixDim dim) {
   cudaD_apply_floor(Gr, Bl, mat, floor_val, dim);

From 5b8b98b8c15fe3d9ce1917600e1d06d726fe7ec3 Mon Sep 17 00:00:00 2001
From: Gaofeng Cheng <770579626@qq.com>
Date: Wed, 21 Dec 2016 21:45:19 +0800
Subject: [PATCH 07/12] Revert "add cuda kernel to realize random-matrix-by
 row"

This reverts commit 1d22219c09989a096dc0e45e7a29edd781b7ea8a.

---
 src/cudamatrix/cu-kernels-ansi.h | 2 --
 src/cudamatrix/cu-kernels.cu     | 8 ++------
 src/cudamatrix/cu-kernels.h      | 3 ---
 3 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index 554837049e3..878ba216407 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -64,7 +64,6 @@ void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d);
 void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power,
                          bool include_sign, MatrixDim d);
 void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
-void cudaF_apply_heaviside_by_row(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
 void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d);
 void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src,
@@ -330,7 +330,6 @@ void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d);
 void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power,
                          bool include_sign, MatrixDim d);
 void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
-void cudaD_apply_heaviside_by_row(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
 void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d);
 void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src,
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 98e03b02b37..e06dfbe56ac 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -1636,11 +1636,11 @@ static void _apply_heaviside_by_row(Real* mat, MatrixDim d) {
   int j_tempt = blockIdx.y * blockDim.y + threadIdx.y;  // row index using to control setting heavyside() in the first rows
   int index = i + j * d.stride;
   if (i < d.cols && j < d.rows)
-    if (j = j_tempt) {
+    if (j = j_ref) {
       mat[index] = (mat[index] > 0.0 ?
                     1.0 : 0.0);
     }
     else {
-      mat[index] = mat[index-d.stride-d.cols];
+      mat[index] = mat[index-d.stride-d.cols]
     }
 }
 
@@ -3901,10 +3901,6 @@ void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
   _apply_heaviside<<<Gr, Bl>>>(mat, d);
 }
 
-void cudaD_apply_heaviside_by_row(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
-  _apply_heaviside_by_row<<<Gr, Bl>>>(mat, d);
-}
-
 void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src,
                      const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
                      int src_stride) {
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 58432f290f7..71493ad8bd6 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -742,9 +742,6 @@ inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power,
 inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) {
   cudaD_apply_heaviside(Gr, Bl, mat, dim);
 }
-inline void cuda_apply_heaviside_by_row(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) {
-  cudaD_apply_heaviside_by_row(Gr, Bl, mat, dim);
-}
 inline void cuda_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val,
                              MatrixDim dim) {
   cudaD_apply_floor(Gr, Bl, mat, floor_val, dim);

From 4137c9d0c117a804dcca78016e412f35e06b39fc Mon Sep 17 00:00:00 2001
From: Gaofeng Cheng <770579626@qq.com>
Date: Wed, 21 Dec 2016 21:45:37 +0800
Subject: [PATCH 08/12] Revert "adding kernel heavybyrow"

This reverts commit 14662b65204f9551f9fad031955055cdcb0643db.

---
 src/cudamatrix/cu-kernels.cu       | 21 ---------------------
 src/cudamatrix/cu-kernels.h        |  3 ---
 src/cudamatrix/cu-matrix.cc        | 17 -----------------
 src/cudamatrix/cu-matrix.h         |  1 -
 src/nnet3/nnet-simple-component.cc | 11 +++++++++--
 5 files changed, 9 insertions(+), 44 deletions(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index e06dfbe56ac..795b4321413 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -1628,23 +1628,6 @@ static void _apply_heaviside(Real* mat, MatrixDim d) {
     mat[index] = (mat[index] > 0.0 ? 1.0 : 0.0);
 }
 
-template<typename Real>
-__global__
-static void _apply_heaviside_by_row(Real* mat, MatrixDim d) {
-  int i = blockIdx.x * blockDim.x + threadIdx.x;  // col index
-  int j = blockIdx.y * blockDim.y + threadIdx.y;  // row index
-  int j_tempt = blockIdx.y * blockDim.y + threadIdx.y;  // row index using to control setting heavyside() in the first rows
-  int index = i + j * d.stride;
-  if (i < d.cols && j < d.rows)
-    if (j = j_ref) {
-      mat[index] = (mat[index] > 0.0 ?
-                    1.0 : 0.0);
-    }
-    else {
-      mat[index] = mat[index-d.stride-d.cols]
-    }
-}
-
-
 template<typename Real>
 __global__
 static void _apply_floor(Real* mat, Real floor_val, MatrixDim d) {
@@ -3250,10 +3233,6 @@ void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
   _apply_heaviside<<<Gr, Bl>>>(mat, d);
 }
 
-void cudaF_apply_heaviside_by_row(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
-  _apply_heaviside_by_row<<<Gr, Bl>>>(mat, d);
-}
-
 void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src,
                      const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
                      int src_stride) {
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 71493ad8bd6..9e9910d6f56 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -201,9 +201,6 @@ inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power,
 inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) {
   cudaF_apply_heaviside(Gr, Bl, mat, dim);
 }
-inline void cuda_apply_heaviside_by_row(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) {
-  cudaF_apply_heaviside_by_row(Gr, Bl, mat, dim);
-}
 inline void cuda_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val,
                              MatrixDim dim) {
   cudaF_apply_floor(Gr, Bl, mat, floor_val, dim);
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index 499949c6bcb..f16b7f0bf52 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -2207,23 +2207,6 @@ void CuMatrixBase<Real>::ApplyHeaviside() {
   }
 }
 
-template<typename Real>
-void CuMatrixBase<Real>::ApplyHeavisideByRow() {
-#if HAVE_CUDA == 1
-  if (CuDevice::Instantiate().Enabled()) {
-    Timer tim;
-    dim3 dimGrid, dimBlock;
-    GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
-                                          &dimGrid, &dimBlock);
-    cuda_apply_heaviside_by_row(dimGrid, dimBlock, data_, Dim());
-    CU_SAFE_CALL(cudaGetLastError());
-    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
-  } else
-#endif
-  {
-    KALDI_ERR << "no ApplyHeavisideByRow implemented without CUDA";
-  }
-}
 template<typename Real>
 void CuMatrixBase<Real>::Heaviside(const CuMatrixBase<Real> &src) {
   KALDI_ASSERT(SameDim(*this, src));
diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index efac59b5aaf..38a6c25071b 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -369,7 +369,6 @@ class CuMatrixBase {
   /// For each element, sets x = (x > 0 ? 1.0 : 0.0).
   /// See also Heaviside().
   void ApplyHeaviside();
-  void ApplyHeavisideByRow();
   void ApplyFloor(Real floor_val);
   void ApplyCeiling(Real ceiling_val);
   void ApplyExp();
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index c79af655715..722898f7eaf 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -146,8 +146,15 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
     // to use multi-threaded code with the GPU.
     const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
     out->Add(-dropout); // now, a proportion "dropout" will be <0.0
-    out->ApplyHeavisideByRow(); // apply the function (x>0?1:0).  Now, a proportion "dropout" will
-                                // be zero and (1 - dropout) will be 1.0 by row.
+    out->ApplyHeaviside(); // apply the function (x>0?1:0).  Now, a proportion "dropout" will
+                           // be zero and (1 - dropout) will be 1.0.
+    CuVector<BaseFloat> *random_drop_vector = new CuVector<BaseFloat>(in.NumRows(), kSetZero);
+    MatrixIndexT i = 0;
+    random_drop_vector->CopyColFromMat(*out, i);
+    for (MatrixIndexT i = 0; i < in.NumCols(); i++)
+    {
+      out->CopyColFromVec(*random_drop_vector, i);
+    }
     out->MulElements(in);
   }
 }

From d721e59658284d8fbccf40bfae46e68ee8f82226 Mon Sep 17 00:00:00 2001
From: Gaofeng Cheng <770579626@qq.com>
Date: Thu, 22 Dec 2016 00:37:53 +0800
Subject: [PATCH 09/12] updating existing best scripts

more experiments are on the way (different places etc...)

---
 .../local/chain/tuning/run_tdnn_lstm_1i_dp.sh      | 20 ++++----
 .../libs/nnet3/train/chain_objf/acoustic_model.py  |  3 +-
 egs/wsj/s5/steps/libs/nnet3/train/common.py        | 15 ++----
 egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py        |  6 +--
 egs/wsj/s5/steps/nnet3/chain/train.py              |  7 ---
 src/nnet3/nnet-chain-combine.cc                    |  2 +-
 src/nnet3/nnet-combine.cc                          |  2 +-
 src/nnet3/nnet-simple-component.cc                 | 49 ++++++++++---------
 src/nnet3/nnet-simple-component.h                  |  7 ++-
 src/nnet3/nnet-utils.cc                            | 10 +---
 src/nnet3/nnet-utils.h                             |  4 +-
 src/nnet3bin/nnet3-combine.cc                      |  2 +-
 12 files changed, 54 insertions(+), 73 deletions(-)

diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i_dp.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i_dp.sh
index 16e2e4b7bcf..ee247135b67 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i_dp.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i_dp.sh
@@ -5,13 +5,13 @@
 # same as 1i but with frame level dropout
 # (num-params 1g:21309812 1i: 43447156)
 # results on sdm1 using ihm ali
-#System                    tdnn_lstm1i_sp_bi_ihmali_ld5
-#WER on dev                37.6      36.7
-#WER on eval               40.9      39.9
-#Final train prob          -0.114135 -0.118
-#Final valid prob          -0.245208 -0.246
-#Final train prob (xent)   -1.47648  -1.54
-#Final valid prob (xent)   -2.16365  -2.10
+#System                    tdnn_lstm1i_sp_bi_ihmali_ld5 tdnn_lstm1i_dp_sp_bi_ihmali_ld5
+#WER on dev                37.6      36.5
+#WER on eval               40.9      39.7
+#Final train prob          -0.114135 -0.124
+#Final valid prob          -0.245208 -0.249
+#Final train prob (xent)   -1.47648  -1.55
+#Final valid prob (xent)   -2.16365  -2.11
 
 set -e -o pipefail
 
 stage=0
 mic=ihm
 nj=30
 min_seg_len=1.55
 use_ihm_ali=false
 train_set=train_cleaned
 gmm=tri3_cleaned # the gmm for the target data
 ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true).
 num_threads_ubm=32
 nnet3_affix=_cleaned  # cleanup affix for nnet3 and chain dirs, e.g. _cleaned
-dropout_schedule='0,0@0.20,0.5@0.50,0@0.50,0'
-dropout_per_frame=false
+dropout_schedule='0,0@0.20,0.5@0.5,0@0.75,0'
 chunk_width=150
 chunk_left_context=40
 chunk_right_context=0
 label_delay=5
 # are just hardcoded at this level, in the commands below.
 train_stage=-10
 tree_affix=  # affix for tree directory, e.g. "a" or "b", in case we change the configuration.
-tlstm_affix=1i  #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration.
+tlstm_affix=1i_dp  #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration.
 common_egs_dir=  # you can set this to use previously dumped egs.
@@ -244,7 +243,6 @@ if [ $stage -le 16 ]; then --egs.chunk-left-context $chunk_left_context \ --egs.chunk-right-context $chunk_right_context \ --trainer.dropout-schedule $dropout_schedule \ - --trainer.dropout-per-frame $dropout_per_frame \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index e1109fea166..bcc876832dd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -225,7 +225,6 @@ def train_one_iteration(dir, iter, srand, egs_dir, frame_subsampling_factor, truncate_deriv_weights, run_opts, dropout_proportions=None, - dropout_per_frame=None, background_process_handler=None): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -308,7 +307,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, dropout_info_str = '' if dropout_proportions is not None: raw_model_string, dropout_info = common_train_lib.apply_dropout( - dropout_proportions, dropout_per_frame, raw_model_string) + dropout_proportions, raw_model_string) dropout_info_str = ', {0}'.format(", ".join(dropout_info)) shrink_info_str = '' diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 952d64cab4c..6d255186cf4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -511,7 +511,7 @@ def _get_component_dropout(dropout_schedule, num_archives_processed): + initial_dropout) -def apply_dropout(dropout_proportions, dropout_per_frame, raw_model_string): +def apply_dropout(dropout_proportions, raw_model_string): """Adds an nnet3-copy --edits line to modify raw_model_string to set dropout proportions according to dropout_proportions. @@ -523,10 +523,10 @@ def apply_dropout(dropout_proportions, dropout_per_frame, raw_model_string): for component_name, dropout_proportion in dropout_proportions: edit_config_lines.append( - "set-dropout-proportion name={0} proportion={1} dropout-per-frame={2}".format( - component_name, dropout_proportion, dropout_per_frame)) - dropout_info.append("pattern/dropout-proportion={0}/{1} dropout-per-frame={2}".format( - component_name, dropout_proportion, dropout_per_frame)) + "set-dropout-proportion name={0} proportion={1}".format( + component_name, dropout_proportion)) + dropout_info.append("pattern/dropout-proportion={0}/{1}".format( + component_name, dropout_proportion)) return ("""{raw_model_string} nnet3-copy --edits='{edits}' \ - - |""".format(raw_model_string=raw_model_string, @@ -771,11 +771,6 @@ def __init__(self): lstm*=0,0.2,0'. 
More general should precede less general patterns, as they are applied sequentially.""") - self.parser.add_argument("--trainer.dropout-per-frame", type=str, - action=common_lib.NullstrToNoneAction, - dest='dropout_per_frame', default=None, - help="""this option is used to control whether - using dropout by frame level or by vector level""") # General options self.parser.add_argument("--stage", type=int, default=-4, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index bc56be8e8f1..be4e93291af 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -250,7 +250,7 @@ def set_default_configs(self): 'self-repair-scale-nonlinearity' : 0.00001, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, - 'dropout-proportion' : -1.0 # -1.0 stands for no dropout will be added + 'dropout-proportion' : -1.0 ,# -1.0 stands for no dropout will be added 'dropout-per-frame' : 'false' } @@ -287,7 +287,7 @@ def check_configs(self): self.config['dropout-proportion'] != -1.0 ): raise xparser_error("dropout-proportion has invalid value {0}.".format(self.config['dropout-proportion'])) - if (self.config['dropout-per-frame'] != 'false' or + if (self.config['dropout-per-frame'] != 'false' and self.config['dropout-per-frame'] != 'true'): raise xparser_error("dropout-per-frame has invalid value {0}.".format(self.config['dropout-per-frame'])) @@ -433,7 +433,7 @@ def generate_lstm_config(self): # add the recurrent connections configs.append("# projection matrices : Wrm and Wpm") if lstm_dropout_value != -1.0: - configs.append("component name={0}.W_rp.m.dropout type=DropoutComponent dim={1} {2}".format(name, cell_dim, lstm_dropout_str)) + configs.append("component name={0}.rp_t.dropout type=DropoutComponent dim={1} {2} {3}".format(name, cell_dim, lstm_dropout_str, lstm_dropout_per_frame_str)) configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 9d497b872b3..3135cab5ecf 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -202,10 +202,6 @@ def process_args(args): "value={0}. 
We recommend using the option "
                        "--trainer.deriv-truncate-margin.".format(
                            args.deriv_truncate_margin))
-    if ( args.dropout_schedule is None )
-        and (args.dropout_per_frame is not None) :
-        raise Exception("The dropout schedule is null, but dropout_per_frame"
-                        "option is not null")
+
     if (not os.path.exists(args.dir)
             or not os.path.exists(args.dir+"/configs")):
         raise Exception("This scripts expects {0} to exist and have a configs "
@@ -444,9 +440,6 @@ def learning_rate(iter, current_num_jobs, num_archives_processed):
                     None if args.dropout_schedule is None
                     else common_train_lib.get_dropout_proportions(
                         dropout_schedule, num_archives_processed)),
-                dropout_per_frame=(
-                    None if args.dropout_schedule is None
-                    else args.dropout_per_frame),
                 shrinkage_value=shrinkage_value,
                 num_chunk_per_minibatch=args.num_chunk_per_minibatch,
                 num_hidden_layers=num_hidden_layers,
diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc
index 048fe1f449c..dd9b99fe26d 100644
--- a/src/nnet3/nnet-chain-combine.cc
+++ b/src/nnet3/nnet-chain-combine.cc
@@ -38,7 +38,7 @@ NnetChainCombiner::NnetChainCombiner(const NnetCombineConfig &combine_config,
     nnet_params_(std::min(num_nnets, combine_config_.max_effective_inputs),
                  NumParameters(first_nnet)),
     tot_input_weighting_(nnet_params_.NumRows()) {
-  SetDropoutProportion(0, false, &nnet_);
+  SetDropoutProportion(0, &nnet_);
   SubVector<BaseFloat> first_params(nnet_params_, 0);
   VectorizeNnet(nnet_, &first_params);
   tot_input_weighting_(0) += 1.0;
diff --git a/src/nnet3/nnet-combine.cc b/src/nnet3/nnet-combine.cc
index 7501c9c84dd..07a96d143c2 100644
--- a/src/nnet3/nnet-combine.cc
+++ b/src/nnet3/nnet-combine.cc
@@ -34,7 +34,7 @@ NnetCombiner::NnetCombiner(const NnetCombineConfig &config,
     nnet_params_(std::min(num_nnets, config_.max_effective_inputs),
                  NumParameters(first_nnet)),
     tot_input_weighting_(nnet_params_.NumRows()) {
-  SetDropoutProportion(0, false, &nnet_);
+  SetDropoutProportion(0, &nnet_);
   SubVector<BaseFloat> first_params(nnet_params_, 0);
   VectorizeNnet(nnet_, &first_params);
   tot_input_weighting_(0) += 1.0;
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index 722898f7eaf..e2889415f4c 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -98,19 +98,14 @@ void DropoutComponent::InitFromConfig(ConfigLine *cfl) {
   BaseFloat dropout_proportion = 0.0;
   bool dropout_per_frame = false;
   bool ok = cfl->GetValue("dim", &dim) &&
-      cfl->GetValue("dropout-proportion", &dropout_proportion);
-  bool ok2 = cfl->GetValue("dropout-per-frame", &dropout_per_frame);
+      cfl->GetValue("dropout-proportion", &dropout_proportion) &&
+      cfl->GetValue("dropout-per-frame", &dropout_per_frame);
   if (!ok || cfl->HasUnusedValues() || dim <= 0 ||
-      dropout_proportion < 0.0 || dropout_proportion > 1.0)
+      dropout_proportion < 0.0 || dropout_proportion > 1.0 ||
+      (dropout_per_frame != false and dropout_per_frame != true))
     KALDI_ERR << "Invalid initializer for layer of type "
               << Type() << ": \"" << cfl->WholeLine() << "\"";
-  if( ! ok2 )
-  {
-    dropout_per_frame = false;
-    Init(dim, dropout_proportion, dropout_per_frame);
-  } else {
-    Init(dim, dropout_proportion, dropout_per_frame);
-  }
+  Init(dim, dropout_proportion, dropout_per_frame);
 }
 
@@ -146,15 +141,12 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
    // to use multi-threaded code with the GPU.
     const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
     out->Add(-dropout); // now, a proportion "dropout" will be <0.0
-    out->ApplyHeaviside(); // apply the function (x>0?1:0).  Now, a proportion "dropout" will
-                           // be zero and (1 - dropout) will be 1.0.
+    out->ApplyHeaviside();
     CuVector<BaseFloat> *random_drop_vector = new CuVector<BaseFloat>(in.NumRows(), kSetZero);
     MatrixIndexT i = 0;
     random_drop_vector->CopyColFromMat(*out, i);
-    for (MatrixIndexT i = 0; i < in.NumCols(); i++)
-    {
-      out->CopyColFromVec(*random_drop_vector, i);
-    }
+    out->SetZero();
+    out->AddVecToCols(1.0 , *random_drop_vector, 1.0);
     out->MulElements(in);
   }
 }
@@ -178,13 +170,24 @@ void DropoutComponent::Backprop(const std::string &debug_info,
 
 
 void DropoutComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<DropoutComponent>", "<Dim>");
-  ReadBasicType(is, binary, &dim_);
-  ExpectToken(is, binary, "<DropoutProportion>");
-  ReadBasicType(is, binary, &dropout_proportion_);
-  ExpectToken(is, binary, "<DropoutPerFrame>");
-  ReadBasicType(is, binary, &dropout_per_frame_);
-  ExpectToken(is, binary, "</DropoutComponent>");
+  //back-compatibility code.
+  std::string token;
+  ReadToken(is, binary, &token);
+  if(token == "<DropoutComponent>"){
+    ReadToken(is, binary, &token);
+  }
+  KALDI_ASSERT(token == "<Dim>");
+  ReadBasicType(is, binary, &dim_); // read dimension.
+  ReadToken(is, binary, &token);
+  if(token == "<DropoutProportion>"){
+    ReadBasicType(is, binary, &dropout_proportion_); // read dropout rate
+  }
+  ReadToken(is, binary, &token);
+  if(token == "<DropoutPerFrame>"){
+    ReadBasicType(is, binary, &dropout_per_frame_); // read dropout mode
+  }
+  ReadToken(is, binary, &token);
+  KALDI_ASSERT(token == "</DropoutComponent>");
 }
 
 void DropoutComponent::Write(std::ostream &os, bool binary) const {
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index b9e86760a46..503eb122f83 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -124,10 +124,9 @@ class DropoutComponent : public RandomComponent {
   virtual std::string Info() const;
 
-  void SetDropoutProportion(BaseFloat dropout_proportion, bool dropout_per_frame) {
-    dropout_proportion_ = dropout_proportion;
-    dropout_per_frame_ = dropout_per_frame;
-  }
+  void SetDropoutProportion(BaseFloat dropout_proportion) {
+    dropout_proportion_ = dropout_proportion;
+  }
 
  private:
   int32 dim_;
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index 973dea3b913..55e0f8fcf6a 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -524,12 +524,10 @@ std::string NnetInfo(const Nnet &nnet) {
 }
 
 void SetDropoutProportion(BaseFloat dropout_proportion,
-                          bool dropout_per_frame,
                           Nnet *nnet) {
-  dropout_per_frame = false;
   for (int32 c = 0; c < nnet->NumComponents(); c++) {
     Component *comp = nnet->GetComponent(c);
     DropoutComponent *dc = dynamic_cast<DropoutComponent*>(comp);
     if (dc != NULL)
-      dc->SetDropoutProportion(dropout_proportion, dropout_per_frame);
+      dc->SetDropoutProportion(dropout_proportion);
   }
 }
 
@@ -696,14 +694,10 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) {
       // matches names of components, not nodes.
      config_line.GetValue("name", &name_pattern);
       BaseFloat proportion = -1;
-      bool dropout_per_frame = false;
       if (!config_line.GetValue("proportion", &proportion)) {
         KALDI_ERR << "In edits-config, expected proportion to be set in line: "
                   << config_line.WholeLine();
       }
-      if (!config_line.GetValue("dropout-per-frame", &dropout_per_frame)) {
-        dropout_per_frame = false;
-      }
       DropoutComponent *component = NULL;
       int32 num_dropout_proportions_set = 0;
       for (int32 c = 0; c < nnet->NumComponents(); c++) {
         if (NameMatchesPattern(nnet->GetComponentName(c).c_str(),
                                name_pattern.c_str()) &&
             (component =
              dynamic_cast<DropoutComponent*>(nnet->GetComponent(c)))) {
-          component->SetDropoutProportion(proportion, dropout_per_frame);
+          component->SetDropoutProportion(proportion);
           num_dropout_proportions_set++;
         }
       }
diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h
index 1d186cc0600..fb304803d1e 100644
--- a/src/nnet3/nnet-utils.h
+++ b/src/nnet3/nnet-utils.h
@@ -182,7 +182,7 @@ std::string NnetInfo(const Nnet &nnet);
 
 /// This function sets the dropout proportion in all dropout component to
 /// dropout_proportion value.
-void SetDropoutProportion(BaseFloat dropout_proportion, bool dropout_per_frame, Nnet *nnet);
+void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet);
 
 /// This function finds a list of components that are never used, and outputs
 /// the integer comopnent indexes (you can use these to index
@@ -233,7 +233,7 @@ void FindOrphanNodes(const Nnet &nnet, std::vector<int32> *nodes);
       remove internal nodes directly; instead you should use the command
      'remove-orphans'.
 
-   set-dropout-proportion [name=<name-pattern>] proportion=<dropout-proportion> dropout-per-frame=<true|false>
+   set-dropout-proportion [name=<name-pattern>] proportion=<dropout-proportion>
      Sets the dropout rates for any components of type DropoutComponent whose
      names match the given <name-pattern> (e.g. lstm*).  <name-pattern> defaults to "*".
\endverbatim diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index ee6bfffdac4..5abc317f054 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -104,7 +104,7 @@ int main(int argc, char *argv[]) { } else { KALDI_LOG << "Copying the single input model directly to the output, " << "without any combination."; - SetDropoutProportion(0, false, &nnet); + SetDropoutProportion(0, &nnet); WriteKaldiObject(nnet, nnet_wxfilename, binary_write); } KALDI_LOG << "Finished combining neural nets, wrote model to " From 1e2adab5eac3d0dbfb6a220b2809e968b04d32a9 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Thu, 22 Dec 2016 00:45:26 +0800 Subject: [PATCH 10/12] fix some bug and format --- egs/wsj/s5/steps/libs/nnet3/train/common.py | 2 +- src/nnet3/nnet-simple-component.cc | 6 +++--- src/nnet3/nnet-simple-component.h | 4 +++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 6d255186cf4..9de29017d45 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -401,7 +401,7 @@ def _parse_dropout_string(num_archives_to_process, dropout_str): value_x_pair = parts[i].split('@') if len(value_x_pair) == 1: # Dropout proportion at half of training - dropout_proportion = float(value_x_pair) + dropout_proportion = float(value_x_pair[0]) num_archives = int(0.5 * num_archives_to_process) else: assert len(value_x_pair) == 2 diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index e2889415f4c..a94486fe309 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -87,7 +87,8 @@ void PnormComponent::Write(std::ostream &os, bool binary) const { } -void DropoutComponent::Init(int32 dim, BaseFloat dropout_proportion, bool dropout_per_frame) { +void DropoutComponent::Init(int32 dim, BaseFloat dropout_proportion, + bool dropout_per_frame) { dropout_proportion_ = dropout_proportion; dropout_per_frame_ = dropout_per_frame; dim_ = dim; @@ -124,8 +125,7 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes, BaseFloat dropout = dropout_proportion_; KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0); - if(dropout_per_frame_) - { + if(dropout_per_frame_) { // This const_cast is only safe assuming you don't attempt // to use multi-threaded code with the GPU. 
const_cast&>(random_generator_).RandUniform(out); diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 503eb122f83..7bc74911a4b 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -89,7 +89,9 @@ class DropoutComponent : public RandomComponent { public: void Init(int32 dim, BaseFloat dropout_proportion = 0.0, bool dropout_per_frame = false); - DropoutComponent(int32 dim, BaseFloat dropout = 0.0, bool dropout_per_frame = false) { Init(dim, dropout, dropout_per_frame); } + DropoutComponent(int32 dim, BaseFloat dropout = 0.0, bool dropout_per_frame = false) { + Init(dim, dropout, dropout_per_frame); + } DropoutComponent(): dim_(0), dropout_proportion_(0.0), dropout_per_frame_(false) { } From 463a4dc2a79a731efe0f96ba7ea912133787e8e8 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Thu, 22 Dec 2016 00:57:46 +0800 Subject: [PATCH 11/12] sublime tool to format nnet-simple-component.cc --- src/nnet3/nnet-simple-component.cc | 959 +++++++++++++++-------------- 1 file changed, 481 insertions(+), 478 deletions(-) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index a94486fe309..2c565283b17 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -41,7 +41,7 @@ void PnormComponent::InitFromConfig(ConfigLine *cfl) { int32 input_dim = 0; int32 output_dim = 0; bool ok = cfl->GetValue("output-dim", &output_dim) && - cfl->GetValue("input-dim", &input_dim); + cfl->GetValue("input-dim", &input_dim); if (!ok || cfl->HasUnusedValues() || output_dim <= 0) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; @@ -99,11 +99,11 @@ void DropoutComponent::InitFromConfig(ConfigLine *cfl) { BaseFloat dropout_proportion = 0.0; bool dropout_per_frame = false; bool ok = cfl->GetValue("dim", &dim) && - cfl->GetValue("dropout-proportion", &dropout_proportion) && - cfl->GetValue("dropout-per-frame", &dropout_per_frame); + cfl->GetValue("dropout-proportion", &dropout_proportion) && + cfl->GetValue("dropout-per-frame", &dropout_per_frame); if (!ok || cfl->HasUnusedValues() || dim <= 0 || dropout_proportion < 0.0 || dropout_proportion > 1.0 || - (dropout_per_frame != false and dropout_per_frame != true)) + (dropout_per_frame != false and dropout_per_frame != true)) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; Init(dim, dropout_proportion, dropout_per_frame); @@ -125,14 +125,14 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes, BaseFloat dropout = dropout_proportion_; KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0); - if(dropout_per_frame_) { + if (dropout_per_frame_) { // This const_cast is only safe assuming you don't attempt // to use multi-threaded code with the GPU. const_cast&>(random_generator_).RandUniform(out); out->Add(-dropout); // now, a proportion "dropout" will be <0.0 out->ApplyHeaviside(); // apply the function (x>0?1:0). Now, a proportion "dropout" will - // be zero and (1 - dropout) will be 1.0. + // be zero and (1 - dropout) will be 1.0. out->MulElements(in); } else { @@ -173,17 +173,17 @@ void DropoutComponent::Read(std::istream &is, bool binary) { //back-compatibility code. std::string token; ReadToken(is, binary, &token); - if(token == ""){ + if (token == "") { ReadToken(is, binary, &token); } KALDI_ASSERT(token == ""); ReadBasicType(is, binary, &dim_); // read dimension.
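The Read() code here is written so that models serialized before this patch, which lack the per-frame flag, still load with the flag defaulting to false. A sketch of that guarded-token pattern; the angle-bracketed token names (<Dim>, <DropoutProportion>, <DropoutPerFrame>) follow Kaldi's usual <MemberName> convention and are assumed here, and a plain text stream stands in for Kaldi's binary I/O:

// An "old format" stream: no <DropoutPerFrame> token, so the flag keeps its
// default. New-format streams would contain "<DropoutPerFrame> true" (or
// false) before the closing tag.
#include <iostream>
#include <sstream>
#include <string>

int main() {
  std::istringstream is("<Dim> 512 <DropoutProportion> 0.5 </DropoutComponent>");

  std::string token;
  int dim = 0;
  double proportion = 0.0;
  bool per_frame = false;  // default when the token is absent

  is >> token >> dim;                // expect <Dim>, then the dimension
  is >> token;
  if (token == "<DropoutProportion>")
    is >> proportion >> token;       // token is now the next tag
  if (token == "<DropoutPerFrame>")  // only present in the new format
    is >> std::boolalpha >> per_frame >> token;

  std::cout << "dim=" << dim << " proportion=" << proportion
            << " per-frame=" << std::boolalpha << per_frame << "\n";
  return 0;
}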
ReadToken(is, binary, &token); - if(token == ""){ + if (token == "") { ReadBasicType(is, binary, &dropout_proportion_); // read dropout rate } ReadToken(is, binary, &token); - if(token == ""){ + if (token == "") { ReadBasicType(is, binary, &dropout_per_frame_); // read dropout mode } ReadToken(is, binary, &token); @@ -212,7 +212,7 @@ void SumReduceComponent::InitFromConfig(ConfigLine *cfl) { int32 input_dim = 0; int32 output_dim = 0; bool ok = cfl->GetValue("output-dim", &output_dim) && - cfl->GetValue("input-dim", &input_dim); + cfl->GetValue("input-dim", &input_dim); if (!ok || cfl->HasUnusedValues() || output_dim <= 0) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; @@ -285,7 +285,7 @@ void ElementwiseProductComponent::InitFromConfig(ConfigLine *cfl) { int32 input_dim = 0; int32 output_dim = 0; bool ok = cfl->GetValue("output-dim", &output_dim) && - cfl->GetValue("input-dim", &input_dim); + cfl->GetValue("input-dim", &input_dim); if (!ok || cfl->HasUnusedValues() || output_dim <= 0) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; @@ -293,9 +293,9 @@ void ElementwiseProductComponent::InitFromConfig(ConfigLine *cfl) { } void ElementwiseProductComponent::Propagate( - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { KALDI_ASSERT(in.NumCols() == input_dim_); int32 num_inputs = input_dim_ / output_dim_; for (int32 i = 0; i < num_inputs; i++) { @@ -310,12 +310,12 @@ void ElementwiseProductComponent::Propagate( } void ElementwiseProductComponent::Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - Component *to_update, - CuMatrixBase *in_deriv) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + Component *to_update, + CuMatrixBase *in_deriv) const { if (!in_deriv) return; int32 num_inputs = input_dim_ / output_dim_; for (int32 i = 0; i < num_inputs; i++) { @@ -327,9 +327,9 @@ void ElementwiseProductComponent::Backprop(const std::string &debug_info, if (i == j) continue; CuSubMatrix in_value_partition(in_value, 0, - in_value.NumRows(), - j * output_dim_, - output_dim_); + in_value.NumRows(), + j * output_dim_, + output_dim_); current_in_deriv.MulElements(in_value_partition); } } @@ -354,7 +354,7 @@ void ElementwiseProductComponent::Write(std::ostream &os, bool binary) const { } const BaseFloat NormalizeComponent::kSquaredNormFloor = - pow(2.0, NormalizeComponent::kExpSquaredNormFloor); + pow(2.0, NormalizeComponent::kExpSquaredNormFloor); // This component modifies the vector of activations by scaling it // so that the root-mean-square equals 1.0. 
It's important that its @@ -369,15 +369,15 @@ void NormalizeComponent::Init(int32 input_dim, BaseFloat target_rms, } NormalizeComponent::NormalizeComponent(const NormalizeComponent &other): - input_dim_(other.input_dim_), target_rms_(other.target_rms_), - add_log_stddev_(other.add_log_stddev_) { } + input_dim_(other.input_dim_), target_rms_(other.target_rms_), + add_log_stddev_(other.add_log_stddev_) { } void NormalizeComponent::InitFromConfig(ConfigLine *cfl) { int32 input_dim = 0; bool add_log_stddev = false; BaseFloat target_rms = 1.0; bool ok = cfl->GetValue("dim", &input_dim) || - cfl->GetValue("input-dim", &input_dim); + cfl->GetValue("input-dim", &input_dim); cfl->GetValue("target-rms", &target_rms); cfl->GetValue("add-log-stddev", &add_log_stddev); if (!ok || cfl->HasUnusedValues() || input_dim <= 0 || target_rms <= 0.0) @@ -486,8 +486,8 @@ void NormalizeComponent::Backprop(const std::string &debug_info, CuMatrixBase *in_deriv) const { if (!in_deriv) return; const CuSubMatrix out_deriv_no_log(out_deriv, - 0, out_deriv.NumRows(), - 0, input_dim_); + 0, out_deriv.NumRows(), + 0, input_dim_); CuVector dot_products(out_deriv.NumRows()); dot_products.AddDiagMatMat(1.0, out_deriv_no_log, kNoTrans, in_value, kTrans, 0.0); @@ -497,7 +497,7 @@ void NormalizeComponent::Backprop(const std::string &debug_info, if (add_log_stddev_) { CuVector log_stddev_deriv(in_norm), // log_stddev deriv as dF/dy .* (x^T x)^-1 - out_deriv_for_stddev(out_deriv.NumRows(), kUndefined); + out_deriv_for_stddev(out_deriv.NumRows(), kUndefined); // f = log(sqrt(max(epsi, x^T x / D))) // df/dx = epsi^2 * D < x^T x ? (1/(x^T x)) * x : 0. // we don't compute this exactly below for the case wehn x^2 x is very @@ -550,9 +550,9 @@ void SigmoidComponent::Backprop(const std::string &debug_info, } void SigmoidComponent::RepairGradients( - const CuMatrixBase &out_value, - CuMatrixBase *in_deriv, - SigmoidComponent *to_update) const { + const CuMatrixBase &out_value, + CuMatrixBase *in_deriv, + SigmoidComponent *to_update) const { KALDI_ASSERT(to_update != NULL); // maximum possible derivative of SigmoidComponent is 0.25. // the default lower-threshold on the derivative, below which we @@ -577,7 +577,7 @@ void SigmoidComponent::RepairGradients( BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? default_lower_threshold : self_repair_lower_threshold_) * - count_; + count_; if (self_repair_upper_threshold_ != unset) { KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid " << "components, it does nothing."; @@ -639,8 +639,8 @@ void SigmoidComponent::StoreStats(const CuMatrixBase &out_value) { void NoOpComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const CuMatrixBase &in, + CuMatrixBase *out) const { out->CopyFromMat(in); } @@ -728,7 +728,7 @@ std::string ClipGradientComponent::Info() const { << (norm_based_clipping_ ? "true" : "false") << ", clipping-threshold=" << clipping_threshold_ << ", clipped-proportion=" - << (count_ > 0 ? static_cast(num_clipped_)/count_ : 0); + << (count_ > 0 ? 
static_cast(num_clipped_) / count_ : 0); if (self_repair_scale_ != 0.0) stream << ", self-repair-clipped-proportion-threshold=" << self_repair_clipped_proportion_threshold_ @@ -748,13 +748,13 @@ void ClipGradientComponent::Init(int32 dim, int32 num_self_repaired, int32 num_backpropped) { KALDI_ASSERT(clipping_threshold >= 0 && dim > 0 && - self_repair_clipped_proportion_threshold >= 0.0 && - self_repair_target >= 0.0 && self_repair_scale >= 0.0); + self_repair_clipped_proportion_threshold >= 0.0 && + self_repair_target >= 0.0 && self_repair_scale >= 0.0); dim_ = dim; norm_based_clipping_ = norm_based_clipping; clipping_threshold_ = clipping_threshold; self_repair_clipped_proportion_threshold_ = - self_repair_clipped_proportion_threshold; + self_repair_clipped_proportion_threshold; self_repair_target_ = self_repair_target; self_repair_scale_ = self_repair_scale; num_clipped_ = num_clipped; @@ -791,27 +791,27 @@ void ClipGradientComponent::InitFromConfig(ConfigLine *cfl) { } void ClipGradientComponent::Propagate( - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { out->CopyFromMat(in); } void ClipGradientComponent::Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, - const CuMatrixBase &out_deriv, - Component *to_update_in, // may be NULL; may be identical - // to "this" or different. - CuMatrixBase *in_deriv) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, + const CuMatrixBase &out_deriv, + Component *to_update_in, // may be NULL; may be identical + // to "this" or different. + CuMatrixBase *in_deriv) const { // the following statement will do nothing if in_deriv and out_deriv have same // memory. in_deriv->CopyFromMat(out_deriv); ClipGradientComponent *to_update = - dynamic_cast(to_update_in); + dynamic_cast(to_update_in); if (clipping_threshold_ > 0) { if (norm_based_clipping_) { @@ -820,11 +820,11 @@ void ClipGradientComponent::Backprop(const std::string &debug_info, CuVector clipping_scales(in_deriv->NumRows()); clipping_scales.AddDiagMat2(pow(clipping_threshold_, -2), *in_deriv, kNoTrans, 0.0); - // now clipping_scales contains the squared (norm of each row divided by - // clipping_threshold) + // now clipping_scales contains the squared (norm of each row divided by + // clipping_threshold) int32 num_not_scaled = clipping_scales.ApplyFloor(1.0); - // now clipping_scales contains min(1, - // squared-(norm/clipping_threshold)) + // now clipping_scales contains min(1, + // squared-(norm/clipping_threshold)) if (num_not_scaled != clipping_scales.Dim()) { clipping_scales.ApplyPow(-0.5); // now clipping_scales contains max(1, @@ -832,7 +832,7 @@ void ClipGradientComponent::Backprop(const std::string &debug_info, in_deriv->MulRowsVec(clipping_scales); if (to_update != NULL) to_update->num_clipped_ += (clipping_scales.Dim() - num_not_scaled); - } + } if (to_update != NULL) to_update->count_ += clipping_scales.Dim(); } else { @@ -858,9 +858,9 @@ void ClipGradientComponent::Backprop(const std::string &debug_info, // comparable to the magnitude of input derivative, especially when the gradient // explosion is actually happening. 
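The norm-based branch above computes, per row, the squared norm divided by the squared threshold, floors it at 1.0, raises it to the power -0.5, and multiplies each row by the result, i.e. every row is scaled by min(1, clipping_threshold / ||row||). A standalone sketch of the same arithmetic:

#include <cmath>
#include <cstdio>
#include <vector>

typedef std::vector<std::vector<double> > Matrix;

// Scale each row by min(1, clipping_threshold / ||row||_2), so rows whose
// norm exceeds the threshold are shrunk back onto the threshold sphere.
void ClipRowNorms(Matrix *deriv, double clipping_threshold) {
  for (std::vector<double> &row : *deriv) {
    double sumsq = 0.0;
    for (double v : row) sumsq += v * v;
    // Same quantity as AddDiagMat2 with coefficient threshold^-2:
    double scale_sq = sumsq / (clipping_threshold * clipping_threshold);
    if (scale_sq > 1.0) {  // the ApplyFloor(1.0) / ApplyPow(-0.5) path
      double scale = 1.0 / std::sqrt(scale_sq);
      for (double &v : row) v *= scale;
    }
  }
}

int main() {
  Matrix deriv = {{3.0, 4.0}, {0.3, 0.4}};  // row norms 5.0 and 0.5
  ClipRowNorms(&deriv, 1.0);
  for (const std::vector<double> &row : deriv)
    std::printf("%.2f %.2f\n", row[0], row[1]);  // 0.60 0.80 / 0.30 0.40
  return 0;
}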
void ClipGradientComponent::RepairGradients( - const std::string &debug_info, - const CuMatrixBase &in_value, - CuMatrixBase *in_deriv, ClipGradientComponent *to_update) const { + const std::string &debug_info, + const CuMatrixBase &in_value, + CuMatrixBase *in_deriv, ClipGradientComponent *to_update) const { KALDI_ASSERT(to_update != NULL); // we use this 'repair_probability' (hardcoded for now) to limit @@ -951,7 +951,7 @@ void ClipGradientComponent::Scale(BaseFloat scale) { void ClipGradientComponent::Add(BaseFloat alpha, const Component &other_in) { const ClipGradientComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); count_ += alpha * other->count_; num_clipped_ += alpha * other->num_clipped_; @@ -968,9 +968,9 @@ void TanhComponent::Propagate(const ComponentPrecomputedIndexes *indexes, void TanhComponent::RepairGradients( - const CuMatrixBase &out_value, - CuMatrixBase *in_deriv, - TanhComponent *to_update) const { + const CuMatrixBase &out_value, + CuMatrixBase *in_deriv, + TanhComponent *to_update) const { KALDI_ASSERT(to_update != NULL); // maximum possible derivative of SigmoidComponent is 1.0 // the default lower-threshold on the derivative, below which we @@ -995,7 +995,7 @@ void TanhComponent::RepairGradients( BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? default_lower_threshold : self_repair_lower_threshold_) * - count_; + count_; if (self_repair_upper_threshold_ != unset) { KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid " << "components, it does nothing."; @@ -1069,27 +1069,27 @@ void TanhComponent::StoreStats(const CuMatrixBase &out_value) { } void RectifiedLinearComponent::Propagate( - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { // Apply rectified linear function (x >= 0 ? 1.0 : 0.0) out->CopyFromMat(in); out->ApplyFloor(0.0); } void RectifiedLinearComponent::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &, //in_value - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - Component *to_update_in, - CuMatrixBase *in_deriv) const { + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, //in_value + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + Component *to_update_in, + CuMatrixBase *in_deriv) const { if (in_deriv != NULL) { in_deriv->Heaviside(out_value); in_deriv->MulElements(out_deriv); RectifiedLinearComponent *to_update = - dynamic_cast(to_update_in); + dynamic_cast(to_update_in); if (to_update != NULL) RepairGradients(in_deriv, to_update); } @@ -1097,11 +1097,11 @@ void RectifiedLinearComponent::Backprop( void RectifiedLinearComponent::RepairGradients( - CuMatrixBase *in_deriv, - RectifiedLinearComponent *to_update) const { + CuMatrixBase *in_deriv, + RectifiedLinearComponent *to_update) const { KALDI_ASSERT(to_update != NULL); BaseFloat default_lower_threshold = 0.05, - default_upper_threshold = 0.95; + default_upper_threshold = 0.95; // we use this 'repair_probability' (hardcoded for now) to limit // this code to running on about half of the minibatches. BaseFloat repair_probability = 0.5; @@ -1118,11 +1118,11 @@ void RectifiedLinearComponent::RepairGradients( BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? 
default_lower_threshold : self_repair_lower_threshold_) * - count_, - upper_threshold = (self_repair_upper_threshold_ == unset ? - default_upper_threshold : - self_repair_upper_threshold_) * - count_; + count_, + upper_threshold = (self_repair_upper_threshold_ == unset ? + default_upper_threshold : + self_repair_upper_threshold_) * + count_; CuMatrix storage(2, dim_ + 2, kUndefined); CuSubVector thresholds_vec(storage.RowData(0) + dim_, 2); @@ -1159,7 +1159,7 @@ void RectifiedLinearComponent::RepairGradients( void RectifiedLinearComponent::StoreStats( - const CuMatrixBase &out_value) { + const CuMatrixBase &out_value) { // only store stats about every other minibatch. if (RandInt(0, 1) == 0) return; @@ -1183,24 +1183,24 @@ void AffineComponent::Resize(int32 input_dim, int32 output_dim) { void AffineComponent::Add(BaseFloat alpha, const Component &other_in) { const AffineComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); linear_params_.AddMat(alpha, other->linear_params_); bias_params_.AddVec(alpha, other->bias_params_); } AffineComponent::AffineComponent(const AffineComponent &component): - UpdatableComponent(component), - linear_params_(component.linear_params_), - bias_params_(component.bias_params_) { } + UpdatableComponent(component), + linear_params_(component.linear_params_), + bias_params_(component.bias_params_) { } AffineComponent::AffineComponent(const CuMatrixBase &linear_params, const CuVectorBase &bias_params, BaseFloat learning_rate): - linear_params_(linear_params), - bias_params_(bias_params) { + linear_params_(linear_params), + bias_params_(bias_params) { SetUnderlyingLearningRate(learning_rate); - KALDI_ASSERT(linear_params.NumRows() == bias_params.Dim()&& + KALDI_ASSERT(linear_params.NumRows() == bias_params.Dim() && bias_params.Dim() != 0); } @@ -1247,9 +1247,9 @@ Component* AffineComponent::Copy() const { BaseFloat AffineComponent::DotProduct(const UpdatableComponent &other_in) const { const AffineComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); return TraceMatMat(linear_params_, other->linear_params_, kTrans) - + VecVec(bias_params_, other->bias_params_); + + VecVec(bias_params_, other->bias_params_); } void AffineComponent::Init(int32 input_dim, int32 output_dim, @@ -1291,7 +1291,7 @@ void AffineComponent::InitFromConfig(ConfigLine *cfl) { ok = ok && cfl->GetValue("input-dim", &input_dim); ok = ok && cfl->GetValue("output-dim", &output_dim); BaseFloat param_stddev = 1.0 / std::sqrt(input_dim), - bias_stddev = 1.0; + bias_stddev = 1.0; cfl->GetValue("param-stddev", ¶m_stddev); cfl->GetValue("bias-stddev", &bias_stddev); Init(input_dim, output_dim, @@ -1309,7 +1309,7 @@ void AffineComponent::InitFromConfig(ConfigLine *cfl) { void AffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, - CuMatrixBase *out) const { + CuMatrixBase *out) const { // No need for asserts as they'll happen within the matrix operations. 
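The propagation body that follows is the standard affine computation: copy the bias vector into every row of the output, then add the input times the transposed weight matrix. A standalone sketch with plain loops in place of the CuMatrix/BLAS calls:

#include <cstdio>
#include <vector>

typedef std::vector<std::vector<double> > Matrix;

// out = in * W^T + b, with W laid out as (output_dim x input_dim) the way
// linear_params_ is; the bias copy mirrors CopyRowsFromVec and the product
// mirrors AddMatMat(1.0, in, kNoTrans, W, kTrans, 1.0).
Matrix AffineForward(const Matrix &in, const Matrix &weights,
                     const std::vector<double> &bias) {
  Matrix out(in.size(), bias);  // every row starts as the bias
  for (size_t r = 0; r < in.size(); ++r)
    for (size_t o = 0; o < weights.size(); ++o)
      for (size_t i = 0; i < weights[o].size(); ++i)
        out[r][o] += in[r][i] * weights[o][i];
  return out;
}

int main() {
  Matrix in = {{1.0, 2.0}};
  Matrix w = {{0.5, 0.5}, {1.0, -1.0}};  // 2 outputs, 2 inputs
  std::vector<double> b = {0.1, 0.2};
  Matrix out = AffineForward(in, w, b);
  std::printf("%.2f %.2f\n", out[0][0], out[0][1]);  // 1.60 -0.80
  return 0;
}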
out->CopyRowsFromVec(bias_params_); // copies bias_params_ to each row @@ -1390,7 +1390,7 @@ void AffineComponent::UnVectorize(const VectorBase ¶ms) { } Component *AffineComponent::CollapseWithNext( - const AffineComponent &next_component) const { + const AffineComponent &next_component) const { AffineComponent *ans = dynamic_cast(this->Copy()); KALDI_ASSERT(ans != NULL); // Note: it's possible that "ans" is really of a derived type such @@ -1408,10 +1408,10 @@ Component *AffineComponent::CollapseWithNext( } Component *AffineComponent::CollapseWithNext( - const FixedAffineComponent &next_component) const { + const FixedAffineComponent &next_component) const { // If at least one was non-updatable, make the whole non-updatable. FixedAffineComponent *ans = - dynamic_cast(next_component.Copy()); + dynamic_cast(next_component.Copy()); KALDI_ASSERT(ans != NULL); ans->linear_params_.Resize(next_component.OutputDim(), InputDim()); ans->bias_params_ = next_component.bias_params_; @@ -1424,10 +1424,10 @@ Component *AffineComponent::CollapseWithNext( } Component *AffineComponent::CollapseWithNext( - const FixedScaleComponent &next_component) const { + const FixedScaleComponent &next_component) const { KALDI_ASSERT(this->OutputDim() == next_component.InputDim()); AffineComponent *ans = - dynamic_cast(this->Copy()); + dynamic_cast(this->Copy()); KALDI_ASSERT(ans != NULL); ans->linear_params_.MulRowsVec(next_component.scales_); ans->bias_params_.MulElements(next_component.scales_); @@ -1436,10 +1436,10 @@ Component *AffineComponent::CollapseWithNext( } Component *AffineComponent::CollapseWithPrevious( - const FixedAffineComponent &prev_component) const { + const FixedAffineComponent &prev_component) const { // If at least one was non-updatable, make the whole non-updatable. 
FixedAffineComponent *ans = - dynamic_cast(prev_component.Copy()); + dynamic_cast(prev_component.Copy()); KALDI_ASSERT(ans != NULL); ans->linear_params_.Resize(this->OutputDim(), prev_component.InputDim()); @@ -1453,10 +1453,10 @@ Component *AffineComponent::CollapseWithPrevious( } RepeatedAffineComponent::RepeatedAffineComponent(const RepeatedAffineComponent & component) : - UpdatableComponent(component), - linear_params_(component.linear_params_), - bias_params_(component.bias_params_), - num_repeats_(component.num_repeats_) {} + UpdatableComponent(component), + linear_params_(component.linear_params_), + bias_params_(component.bias_params_), + num_repeats_(component.num_repeats_) {} void RepeatedAffineComponent::Scale(BaseFloat scale) { @@ -1466,7 +1466,7 @@ void RepeatedAffineComponent::Scale(BaseFloat scale) { void RepeatedAffineComponent::Add(BaseFloat alpha, const Component &other_in) { const RepeatedAffineComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); linear_params_.AddMat(alpha, other->linear_params_); bias_params_.AddVec(alpha, other->bias_params_); @@ -1481,7 +1481,7 @@ void RepeatedAffineComponent::SetZero(bool treat_as_gradient) { bias_params_.SetZero(); } -void RepeatedAffineComponent::PerturbParams(BaseFloat stddev){ +void RepeatedAffineComponent::PerturbParams(BaseFloat stddev) { CuMatrix temp_linear_params(linear_params_); temp_linear_params.SetRandn(); linear_params_.AddMat(stddev, temp_linear_params); @@ -1506,9 +1506,9 @@ Component* RepeatedAffineComponent::Copy() const { BaseFloat RepeatedAffineComponent::DotProduct(const UpdatableComponent &other_in) const { const RepeatedAffineComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); return TraceMatMat(linear_params_, other->linear_params_, kTrans) - + VecVec(bias_params_, other->bias_params_); + + VecVec(bias_params_, other->bias_params_); } void RepeatedAffineComponent::Init(int32 input_dim, int32 output_dim, int32 num_repeats, @@ -1541,7 +1541,7 @@ void RepeatedAffineComponent::InitFromConfig(ConfigLine *cfl) { KALDI_ASSERT(output_dim % num_repeats == 0 && "num-repeats must divide output-dim"); BaseFloat param_stddev = 1.0 / std::sqrt(input_dim / num_repeats), - bias_mean = 0.0, bias_stddev = 0.0; + bias_mean = 0.0, bias_stddev = 0.0; cfl->GetValue("param-stddev", ¶m_stddev); cfl->GetValue("bias-mean", &bias_mean); cfl->GetValue("bias-stddev", &bias_stddev); @@ -1549,7 +1549,7 @@ void RepeatedAffineComponent::InitFromConfig(ConfigLine *cfl) { num_repeats, param_stddev, bias_mean, bias_stddev); if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); + << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); } @@ -1564,14 +1564,14 @@ void RepeatedAffineComponent::Propagate(const ComponentPrecomputedIndexes *index out->NumRows() == in.NumRows()); int32 num_repeats = num_repeats_, - num_rows = in.NumRows(), - block_dim_out = linear_params_.NumRows(), - block_dim_in = linear_params_.NumCols(); + num_rows = in.NumRows(), + block_dim_out = linear_params_.NumRows(), + block_dim_in = linear_params_.NumCols(); CuSubMatrix in_reshaped(in.Data(), num_rows * num_repeats, block_dim_in, block_dim_in), - out_reshaped(out->Data(), num_rows * num_repeats, - block_dim_out, block_dim_out); + out_reshaped(out->Data(), num_rows * num_repeats, + block_dim_out, block_dim_out); out_reshaped.CopyRowsFromVec(bias_params_); @@ -1587,11 +1587,11 @@ void 
RepeatedAffineComponent::Backprop(const std::string &debug_info, Component *to_update_in, CuMatrixBase *in_deriv) const { KALDI_ASSERT(out_deriv.NumCols() == out_deriv.Stride() && - (in_value.NumCols() == 0 || in_value.NumCols() == in_value.Stride()) && + (in_value.NumCols() == 0 || in_value.NumCols() == in_value.Stride()) && (!in_deriv || in_deriv->NumCols() == in_deriv->Stride())); RepeatedAffineComponent *to_update = dynamic_cast( - to_update_in); + to_update_in); // Propagate the derivative back to the input. // add with coefficient 1.0 since property kBackpropAdds is true. @@ -1599,16 +1599,16 @@ void RepeatedAffineComponent::Backprop(const std::string &debug_info, // in_deriv, in case of infinities. if (in_deriv) { int32 num_repeats = num_repeats_, - num_rows = out_deriv.NumRows(), - block_dim_out = linear_params_.NumRows(), - block_dim_in = linear_params_.NumCols(); + num_rows = out_deriv.NumRows(), + block_dim_out = linear_params_.NumRows(), + block_dim_in = linear_params_.NumCols(); CuSubMatrix in_deriv_reshaped(in_deriv->Data(), - num_rows * num_repeats, - block_dim_in, block_dim_in), - out_deriv_reshaped(out_deriv.Data(), - num_rows * num_repeats, - block_dim_out, block_dim_out); + num_rows * num_repeats, + block_dim_in, block_dim_in), + out_deriv_reshaped(out_deriv.Data(), + num_rows * num_repeats, + block_dim_out, block_dim_out); in_deriv_reshaped.AddMatMat(1.0, out_deriv_reshaped, kNoTrans, linear_params_, kNoTrans, 1.0); } @@ -1626,17 +1626,17 @@ void RepeatedAffineComponent::Update(const CuMatrixBase &in_value, in_value.NumRows() == out_deriv.NumRows()); - int32 num_repeats = num_repeats_, + int32 num_repeats = num_repeats_, num_rows = in_value.NumRows(), block_dim_out = linear_params_.NumRows(), block_dim_in = linear_params_.NumCols(); - CuSubMatrix in_value_reshaped(in_value.Data(), - num_rows * num_repeats, - block_dim_in, block_dim_in), - out_deriv_reshaped(out_deriv.Data(), - num_rows * num_repeats, - block_dim_out, block_dim_out); + CuSubMatrix in_value_reshaped(in_value.Data(), + num_rows * num_repeats, + block_dim_in, block_dim_in), + out_deriv_reshaped(out_deriv.Data(), + num_rows * num_repeats, + block_dim_out, block_dim_out); linear_params_.AddMatMat(learning_rate_, out_deriv_reshaped, kTrans, @@ -1707,9 +1707,9 @@ void NaturalGradientRepeatedAffineComponent::SetNaturalGradientConfigs() { } NaturalGradientRepeatedAffineComponent::NaturalGradientRepeatedAffineComponent( - const NaturalGradientRepeatedAffineComponent &other): - RepeatedAffineComponent(other), - preconditioner_in_(other.preconditioner_in_) { } + const NaturalGradientRepeatedAffineComponent &other): + RepeatedAffineComponent(other), + preconditioner_in_(other.preconditioner_in_) { } // virtual Component* NaturalGradientRepeatedAffineComponent::Copy() const { @@ -1717,23 +1717,23 @@ Component* NaturalGradientRepeatedAffineComponent::Copy() const { } void NaturalGradientRepeatedAffineComponent::Update( - const CuMatrixBase &in_value, - const CuMatrixBase &out_deriv) { + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv) { KALDI_ASSERT(out_deriv.NumCols() == out_deriv.Stride() && in_value.NumCols() == in_value.Stride() && in_value.NumRows() == out_deriv.NumRows()); int32 num_repeats = num_repeats_, - num_rows = in_value.NumRows(), - block_dim_out = linear_params_.NumRows(), - block_dim_in = linear_params_.NumCols(); + num_rows = in_value.NumRows(), + block_dim_out = linear_params_.NumRows(), + block_dim_in = linear_params_.NumCols(); CuSubMatrix in_value_reshaped(in_value.Data(), - num_rows 
* num_repeats, - block_dim_in, block_dim_in), - out_deriv_reshaped(out_deriv.Data(), - num_rows * num_repeats, - block_dim_out, block_dim_out); + num_rows * num_repeats, + block_dim_in, block_dim_in), + out_deriv_reshaped(out_deriv.Data(), + num_rows * num_repeats, + block_dim_out, block_dim_out); CuVector bias_deriv(block_dim_out); bias_deriv.AddRowSumMat(1.0, out_deriv_reshaped); @@ -1741,8 +1741,8 @@ void NaturalGradientRepeatedAffineComponent::Update( CuMatrix deriv(block_dim_out, block_dim_in + 1); deriv.ColRange(0, block_dim_in).AddMatMat( - 1.0, out_deriv_reshaped, kTrans, - in_value_reshaped, kNoTrans, 1.0); + 1.0, out_deriv_reshaped, kTrans, + in_value_reshaped, kNoTrans, 1.0); deriv.CopyColFromVec(bias_deriv, block_dim_in); BaseFloat scale = 1.0; @@ -1782,13 +1782,13 @@ BlockAffineComponent::BlockAffineComponent(const RepeatedAffineComponent &rac) : num_blocks_(rac.num_repeats_) { // copy rac's linear_params_ and bias_params_ to this. int32 num_rows_in_block = rac.linear_params_.NumRows(); - for(int32 block_counter = 0; block_counter < num_blocks_; block_counter++) { + for (int32 block_counter = 0; block_counter < num_blocks_; block_counter++) { int32 row_offset = block_counter * num_rows_in_block; CuSubMatrix block = this->linear_params_.RowRange(row_offset, - num_rows_in_block); + num_rows_in_block); block.CopyFromMat(rac.linear_params_); CuSubVector block_bias = this->bias_params_.Range(row_offset, - num_rows_in_block); + num_rows_in_block); block_bias.CopyFromVec(rac.bias_params_); } } @@ -1827,14 +1827,14 @@ void BlockAffineComponent::Init(int32 input_dim, void BlockAffineComponent::InitFromConfig(ConfigLine *cfl) { int32 input_dim = -1, output_dim = -1, num_blocks = -1; - if(!cfl->GetValue("input-dim", &input_dim) || - !cfl->GetValue("output-dim", &output_dim) || - !cfl->GetValue("num-blocks", &num_blocks)) + if (!cfl->GetValue("input-dim", &input_dim) || + !cfl->GetValue("output-dim", &output_dim) || + !cfl->GetValue("num-blocks", &num_blocks)) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; InitLearningRatesFromConfig(cfl); BaseFloat param_stddev = 1.0 / std::sqrt(input_dim / num_blocks), - bias_mean = 0.0, bias_stddev = 1.0; + bias_mean = 0.0, bias_stddev = 1.0; cfl->GetValue("param-stddev", ¶m_stddev); cfl->GetValue("bias-stddev", &bias_stddev); cfl->GetValue("bias-mean", &bias_mean); @@ -1856,21 +1856,21 @@ void BlockAffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes, int32 num_rows_in_block = linear_params_.NumRows() / num_blocks_; int32 num_cols_in_block = linear_params_.NumCols(); std::vector *> in_batch, out_batch, - linear_params_batch; - for(int block_counter = 0; block_counter < num_blocks_; block_counter++) { + linear_params_batch; + for (int block_counter = 0; block_counter < num_blocks_; block_counter++) { CuSubMatrix *in_block = new CuSubMatrix(in.ColRange(block_counter * num_cols_in_block, - num_cols_in_block)); + num_cols_in_block)); in_batch.push_back(in_block); CuSubMatrix *out_block = new CuSubMatrix(out->ColRange(block_counter * num_rows_in_block, - num_rows_in_block)); + num_rows_in_block)); out_batch.push_back(out_block); CuSubMatrix *linear_params_block = new CuSubMatrix(linear_params_.RowRange(block_counter * num_rows_in_block, - num_rows_in_block)); + num_rows_in_block)); linear_params_batch.push_back(linear_params_block); } AddMatMatBatched(1.0, out_batch, in_batch, kNoTrans, @@ -1900,20 +1900,20 @@ void BlockAffineComponent::Backprop(const std::string &debug_info, if 
(in_deriv) { std::vector *> in_deriv_batch, out_deriv_batch, linear_params_batch; - for(int block_counter = 0; block_counter < num_blocks_; block_counter++) { + for (int block_counter = 0; block_counter < num_blocks_; block_counter++) { CuSubMatrix *in_deriv_block = new CuSubMatrix(in_deriv->ColRange(block_counter * num_cols_in_block, - num_cols_in_block)); + num_cols_in_block)); in_deriv_batch.push_back(in_deriv_block); CuSubMatrix *out_deriv_block = new CuSubMatrix(out_deriv.ColRange(block_counter * num_rows_in_block, - num_rows_in_block)); + num_rows_in_block)); out_deriv_batch.push_back(out_deriv_block); CuSubMatrix *linear_params_block = new CuSubMatrix(linear_params_.RowRange(block_counter * num_rows_in_block, - num_rows_in_block)); + num_rows_in_block)); linear_params_batch.push_back(linear_params_block); } @@ -1927,25 +1927,26 @@ void BlockAffineComponent::Backprop(const std::string &debug_info, if (to_update != NULL) { - { // linear params update + { + // linear params update std::vector *> in_value_batch, - out_deriv_batch, linear_params_batch; + out_deriv_batch, linear_params_batch; for (int block_counter = 0; block_counter < num_blocks_; block_counter++) { CuSubMatrix *in_value_block = new CuSubMatrix(in_value.ColRange(block_counter * num_cols_in_block, - num_cols_in_block)); + num_cols_in_block)); in_value_batch.push_back(in_value_block); CuSubMatrix *out_deriv_block = new CuSubMatrix(out_deriv.ColRange(block_counter * num_rows_in_block, - num_rows_in_block)); + num_rows_in_block)); out_deriv_batch.push_back(out_deriv_block); CuSubMatrix *linear_params_block = new CuSubMatrix(to_update->linear_params_.RowRange(block_counter * num_rows_in_block, - num_rows_in_block)); + num_rows_in_block)); linear_params_batch.push_back(linear_params_block); } @@ -1959,7 +1960,8 @@ void BlockAffineComponent::Backprop(const std::string &debug_info, DeletePointers(&linear_params_batch); } // end linear params update - { // bias update + { + // bias update to_update->bias_params_.AddRowSumMat(to_update->learning_rate_, out_deriv, 1.0); } // end bias update @@ -2002,7 +2004,7 @@ BaseFloat BlockAffineComponent::DotProduct(const UpdatableComponent &other_in) c const BlockAffineComponent *other = dynamic_cast(&other_in); return TraceMatMat(linear_params_, other->linear_params_, kTrans) + - VecVec(bias_params_, other->bias_params_); + VecVec(bias_params_, other->bias_params_); } void BlockAffineComponent::Read(std::istream &is, bool binary) { @@ -2058,15 +2060,15 @@ void PerElementScaleComponent::Scale(BaseFloat scale) { void PerElementScaleComponent::Add(BaseFloat alpha, const Component &other_in) { const PerElementScaleComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); scales_.AddVec(alpha, other->scales_); } PerElementScaleComponent::PerElementScaleComponent( - const PerElementScaleComponent &component): - UpdatableComponent(component), - scales_(component.scales_) { } + const PerElementScaleComponent &component): + UpdatableComponent(component), + scales_(component.scales_) { } void PerElementScaleComponent::SetZero(bool treat_as_gradient) { if (treat_as_gradient) { @@ -2096,9 +2098,9 @@ Component* PerElementScaleComponent::Copy() const { } BaseFloat PerElementScaleComponent::DotProduct( - const UpdatableComponent &other_in) const { + const UpdatableComponent &other_in) const { const PerElementScaleComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); return VecVec(scales_, other->scales_); } @@ -2129,7 +2131,7 @@ void 
PerElementScaleComponent::InitFromConfig(ConfigLine *cfl) { KALDI_ASSERT(dim == InputDim() && "input-dim mismatch vs. vector."); } else { - if(!cfl->GetValue("dim", &dim)) + if (!cfl->GetValue("dim", &dim)) KALDI_ERR << "'dim' not provided in the config line."; BaseFloat param_mean = 1.0, param_stddev = 0.0; cfl->GetValue("param-mean", ¶m_mean); @@ -2142,30 +2144,30 @@ void PerElementScaleComponent::InitFromConfig(ConfigLine *cfl) { } void PerElementScaleComponent::Propagate( - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { out->CopyFromMat(in); out->MulColsVec(scales_); } void PerElementScaleComponent::UpdateSimple( - const CuMatrixBase &in_value, - const CuMatrixBase &out_deriv) { + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv) { scales_.AddDiagMatMat(learning_rate_, out_deriv, kTrans, in_value, kNoTrans, 1.0); } void PerElementScaleComponent::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value - const CuMatrixBase &out_deriv, - Component *to_update_in, - CuMatrixBase *in_deriv) const { + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + Component *to_update_in, + CuMatrixBase *in_deriv) const { PerElementScaleComponent *to_update = - dynamic_cast(to_update_in); + dynamic_cast(to_update_in); if (in_deriv) { // Propagate the derivative back to the input. @@ -2210,7 +2212,7 @@ void PerElementScaleComponent::Vectorize(VectorBase *params) const { } void PerElementScaleComponent::UnVectorize( - const VectorBase ¶ms) { + const VectorBase ¶ms) { scales_.CopyFromVec(params); } @@ -2220,17 +2222,17 @@ void PerElementOffsetComponent::Scale(BaseFloat scale) { void PerElementOffsetComponent::Add(BaseFloat alpha, - const Component &other_in) { + const Component &other_in) { const PerElementOffsetComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); offsets_.AddVec(alpha, other->offsets_); } PerElementOffsetComponent::PerElementOffsetComponent( - const PerElementOffsetComponent &component): - UpdatableComponent(component), - offsets_(component.offsets_) { } + const PerElementOffsetComponent &component): + UpdatableComponent(component), + offsets_(component.offsets_) { } void PerElementOffsetComponent::SetZero(bool treat_as_gradient) { if (treat_as_gradient) { @@ -2260,9 +2262,9 @@ Component* PerElementOffsetComponent::Copy() const { } BaseFloat PerElementOffsetComponent::DotProduct( - const UpdatableComponent &other_in) const { + const UpdatableComponent &other_in) const { const PerElementOffsetComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); return VecVec(offsets_, other->offsets_); } @@ -2293,7 +2295,7 @@ void PerElementOffsetComponent::InitFromConfig(ConfigLine *cfl) { KALDI_ASSERT(dim == InputDim() && "input-dim mismatch vs. 
vector."); } else { - if(!cfl->GetValue("dim", &dim)) + if (!cfl->GetValue("dim", &dim)) KALDI_ERR << "'dim' not provided in the config line."; BaseFloat param_mean = 0.0, param_stddev = 0.0; cfl->GetValue("param-mean", ¶m_mean); @@ -2306,23 +2308,23 @@ void PerElementOffsetComponent::InitFromConfig(ConfigLine *cfl) { } void PerElementOffsetComponent::Propagate( - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { out->CopyFromMat(in); out->AddVecToRows(1.0, offsets_); } void PerElementOffsetComponent::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &, // in_value - const CuMatrixBase &, // out_value - const CuMatrixBase &out_deriv, - Component *to_update_in, - CuMatrixBase *in_deriv) const { + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + Component *to_update_in, + CuMatrixBase *in_deriv) const { PerElementOffsetComponent *to_update = - dynamic_cast(to_update_in); + dynamic_cast(to_update_in); if (in_deriv) { // Propagate the derivative back to the input. @@ -2360,7 +2362,7 @@ void PerElementOffsetComponent::Vectorize(VectorBase *params) const { } void PerElementOffsetComponent::UnVectorize( - const VectorBase ¶ms) { + const VectorBase ¶ms) { offsets_.CopyFromVec(params); } @@ -2377,30 +2379,30 @@ std::string ConstantFunctionComponent::Info() const { } ConstantFunctionComponent::ConstantFunctionComponent(): - input_dim_(-1), is_updatable_(true), use_natural_gradient_(true) { } + input_dim_(-1), is_updatable_(true), use_natural_gradient_(true) { } ConstantFunctionComponent::ConstantFunctionComponent( - const ConstantFunctionComponent &other): - input_dim_(other.input_dim_), output_(other.output_), - is_updatable_(other.is_updatable_), - use_natural_gradient_(other.use_natural_gradient_), - preconditioner_(other.preconditioner_) { } + const ConstantFunctionComponent &other): + input_dim_(other.input_dim_), output_(other.output_), + is_updatable_(other.is_updatable_), + use_natural_gradient_(other.use_natural_gradient_), + preconditioner_(other.preconditioner_) { } void ConstantFunctionComponent::Propagate( - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { out->CopyRowsFromVec(output_); } void ConstantFunctionComponent::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &, // in_value - const CuMatrixBase &, // out_value - const CuMatrixBase &out_deriv, - Component *to_update_in, - CuMatrixBase *in_deriv) const { + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + Component *to_update_in, + CuMatrixBase *in_deriv) const { // we don't update in_deriv, since we set the flag // kBackpropAdds, and the output doesn't depend on the // input, so the input-derivative is zero. 
@@ -2414,7 +2416,7 @@ void ConstantFunctionComponent::Backprop( CuMatrix out_deriv_copy(out_deriv); BaseFloat scale = 1.0; to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, - NULL, &scale); + NULL, &scale); to_update->output_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } else { @@ -2489,7 +2491,7 @@ void ConstantFunctionComponent::Scale(BaseFloat scale) { void ConstantFunctionComponent::Add(BaseFloat alpha, const Component &other_in) { if (is_updatable_) { const ConstantFunctionComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); output_.AddVec(alpha, other->output_); } @@ -2510,10 +2512,10 @@ void ConstantFunctionComponent::PerturbParams(BaseFloat stddev) { } BaseFloat ConstantFunctionComponent::DotProduct( - const UpdatableComponent &other_in) const { + const UpdatableComponent &other_in) const { KALDI_ASSERT(is_updatable_); const ConstantFunctionComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); return VecVec(output_, other->output_); } @@ -2522,7 +2524,7 @@ void ConstantFunctionComponent::InitFromConfig(ConfigLine *cfl) { int32 output_dim = 0; InitLearningRatesFromConfig(cfl); bool ok = cfl->GetValue("output-dim", &output_dim) && - cfl->GetValue("input-dim", &input_dim_); + cfl->GetValue("input-dim", &input_dim_); cfl->GetValue("is-updatable", &is_updatable_); cfl->GetValue("use-natural-gradient", &use_natural_gradient_); BaseFloat output_mean = 0.0, output_stddev = 0.0; @@ -2554,13 +2556,13 @@ void ConstantFunctionComponent::UnVectorize(const VectorBase ¶ms) NaturalGradientAffineComponent::NaturalGradientAffineComponent(): - max_change_per_sample_(0.0), - update_count_(0.0), active_scaling_count_(0.0), - max_change_scale_stats_(0.0) { } + max_change_per_sample_(0.0), + update_count_(0.0), active_scaling_count_(0.0), + max_change_scale_stats_(0.0) { } // virtual void NaturalGradientAffineComponent::Resize( - int32 input_dim, int32 output_dim) { + int32 input_dim, int32 output_dim) { KALDI_ASSERT(input_dim > 1 && output_dim > 1); if (rank_in_ >= input_dim) rank_in_ = input_dim - 1; if (rank_out_ >= output_dim) rank_out_ = output_dim - 1; @@ -2614,9 +2616,9 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { bool ok = true; std::string matrix_filename; BaseFloat num_samples_history = 2000.0, alpha = 4.0, - max_change_per_sample = 0.0; + max_change_per_sample = 0.0; int32 input_dim = -1, output_dim = -1, rank_in = 20, rank_out = 80, - update_period = 4; + update_period = 4; InitLearningRatesFromConfig(cfl); cfl->GetValue("num-samples-history", &num_samples_history); cfl->GetValue("alpha", &alpha); @@ -2641,7 +2643,7 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); BaseFloat param_stddev = 1.0 / std::sqrt(input_dim), - bias_stddev = 1.0, bias_mean = 0.0; + bias_stddev = 1.0, bias_mean = 0.0; cfl->GetValue("param-stddev", ¶m_stddev); cfl->GetValue("bias-stddev", &bias_stddev); cfl->GetValue("bias-mean", &bias_mean); @@ -2668,10 +2670,10 @@ void NaturalGradientAffineComponent::SetNaturalGradientConfigs() { } void NaturalGradientAffineComponent::Init( - int32 rank_in, int32 rank_out, - int32 update_period, BaseFloat num_samples_history, BaseFloat alpha, - BaseFloat max_change_per_sample, - std::string matrix_filename) { + int32 rank_in, int32 rank_out, + int32 update_period, BaseFloat num_samples_history, BaseFloat alpha, + BaseFloat 
max_change_per_sample, + std::string matrix_filename) { rank_in_ = rank_in; rank_out_ = rank_out; update_period_ = update_period; @@ -2695,11 +2697,11 @@ void NaturalGradientAffineComponent::Init( } void NaturalGradientAffineComponent::Init( - int32 input_dim, int32 output_dim, - BaseFloat param_stddev, BaseFloat bias_stddev, BaseFloat bias_mean, - int32 rank_in, int32 rank_out, int32 update_period, - BaseFloat num_samples_history, BaseFloat alpha, - BaseFloat max_change_per_sample) { + int32 input_dim, int32 output_dim, + BaseFloat param_stddev, BaseFloat bias_stddev, BaseFloat bias_mean, + int32 rank_in, int32 rank_out, int32 update_period, + BaseFloat num_samples_history, BaseFloat alpha, + BaseFloat max_change_per_sample) { linear_params_.Resize(output_dim, input_dim); bias_params_.Resize(output_dim); KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0 && @@ -2729,7 +2731,7 @@ void NaturalGradientAffineComponent::Init( } void NaturalGradientAffineComponent::Write(std::ostream &os, - bool binary) const { + bool binary) const { WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate WriteToken(os, binary, ""); linear_params_.Write(os, binary); @@ -2782,26 +2784,26 @@ Component* NaturalGradientAffineComponent::Copy() const { } NaturalGradientAffineComponent::NaturalGradientAffineComponent( - const NaturalGradientAffineComponent &other): - AffineComponent(other), - rank_in_(other.rank_in_), - rank_out_(other.rank_out_), - update_period_(other.update_period_), - num_samples_history_(other.num_samples_history_), - alpha_(other.alpha_), - preconditioner_in_(other.preconditioner_in_), - preconditioner_out_(other.preconditioner_out_), - max_change_per_sample_(other.max_change_per_sample_), - update_count_(other.update_count_), - active_scaling_count_(other.active_scaling_count_), - max_change_scale_stats_(other.max_change_scale_stats_) { + const NaturalGradientAffineComponent &other): + AffineComponent(other), + rank_in_(other.rank_in_), + rank_out_(other.rank_out_), + update_period_(other.update_period_), + num_samples_history_(other.num_samples_history_), + alpha_(other.alpha_), + preconditioner_in_(other.preconditioner_in_), + preconditioner_out_(other.preconditioner_out_), + max_change_per_sample_(other.max_change_per_sample_), + update_count_(other.update_count_), + active_scaling_count_(other.active_scaling_count_), + max_change_scale_stats_(other.max_change_scale_stats_) { SetNaturalGradientConfigs(); } void NaturalGradientAffineComponent::Update( - const std::string &debug_info, - const CuMatrixBase &in_value, - const CuMatrixBase &out_deriv) { + const std::string &debug_info, + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv) { CuMatrix in_value_temp; in_value_temp.Resize(in_value.NumRows(), @@ -2818,16 +2820,16 @@ void NaturalGradientAffineComponent::Update( CuMatrix row_products(2, in_value.NumRows()); CuSubVector in_row_products(row_products, 0), - out_row_products(row_products, 1); + out_row_products(row_products, 1); // These "scale" values get will get multiplied into the learning rate (faster // than having the matrices scaled inside the preconditioning code). 
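In the update that follows, PreconditionDirections rescales copies of the input and the output derivative in place and hands back scalar factors; those scalars are folded into the learning rate before the usual outer-product accumulation W += lr * scale * out_deriv^T * in_value. A simplified sketch with the preconditioner abstracted away entirely (so this is plain SGD with a scale factor, not the natural-gradient math itself):

#include <cstdio>
#include <vector>

typedef std::vector<std::vector<double> > Matrix;

// W (output_dim x input_dim) += lr * scale * out_deriv^T * in_value, the
// same accumulation as AddMatMat(.., out_deriv, kTrans, in_value, kNoTrans,
// 1.0); "scale" stands for in_scale * out_scale from the preconditioners.
void UpdateLinearParams(const Matrix &in_value, const Matrix &out_deriv,
                        double learning_rate, double scale, Matrix *W) {
  double lr = learning_rate * scale;
  for (size_t r = 0; r < in_value.size(); ++r)  // sum over frames
    for (size_t o = 0; o < W->size(); ++o)
      for (size_t i = 0; i < (*W)[o].size(); ++i)
        (*W)[o][i] += lr * out_deriv[r][o] * in_value[r][i];
}

int main() {
  Matrix W = {{0.0, 0.0}};  // 1 output, 2 inputs
  Matrix in = {{1.0, 2.0}};
  Matrix od = {{0.5}};
  UpdateLinearParams(in, od, /*learning_rate=*/0.1, /*scale=*/1.0, &W);
  std::printf("%.3f %.3f\n", W[0][0], W[0][1]);  // 0.050 0.100
  return 0;
}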
BaseFloat in_scale, out_scale; preconditioner_in_.PreconditionDirections(&in_value_temp, &in_row_products, - &in_scale); + &in_scale); preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_row_products, - &out_scale); + &out_scale); // "scale" is a scaling factor coming from the PreconditionDirections calls // (it's faster to have them output a scaling factor than to have them scale @@ -2835,8 +2837,8 @@ void NaturalGradientAffineComponent::Update( BaseFloat scale = in_scale * out_scale; CuSubMatrix in_value_precon_part(in_value_temp, - 0, in_value_temp.NumRows(), - 0, in_value_temp.NumCols() - 1); + 0, in_value_temp.NumRows(), + 0, in_value_temp.NumCols() - 1); // this "precon_ones" is what happens to the vector of 1's representing // offsets, after multiplication by the preconditioner. CuVector precon_ones(in_value_temp.NumRows()); @@ -2867,7 +2869,7 @@ void NaturalGradientAffineComponent::Scale(BaseFloat scale) { void NaturalGradientAffineComponent::Add(BaseFloat alpha, const Component &other_in) { const NaturalGradientAffineComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); update_count_ += alpha * other->update_count_; max_change_scale_stats_ += alpha * other->max_change_scale_stats_; @@ -2921,8 +2923,8 @@ void FixedAffineComponent::InitFromConfig(ConfigLine *cfl) { FixedAffineComponent::FixedAffineComponent(const AffineComponent &c): - linear_params_(c.LinearParams()), - bias_params_(c.BiasParams()) { } + linear_params_(c.LinearParams()), + bias_params_(c.BiasParams()) { } void FixedAffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, @@ -3058,7 +3060,7 @@ void SumGroupComponent::GetSizes(std::vector *sizes) const { for (size_t i = 0; i < indexes.size(); i++) { (*sizes)[i] = indexes[i].second - indexes[i].first; if (i == 0) { KALDI_ASSERT(indexes[i].first == 0); } - else { KALDI_ASSERT(indexes[i].first == indexes[i-1].second); } + else { KALDI_ASSERT(indexes[i].first == indexes[i - 1].second); } KALDI_ASSERT(indexes[i].second > indexes[i].first); (*sizes)[i] = indexes[i].second - indexes[i].first; } @@ -3301,7 +3303,7 @@ void FixedBiasComponent::Read(std::istream &is, bool binary) { void NaturalGradientPerElementScaleComponent::Read( - std::istream &is, bool binary) { + std::istream &is, bool binary) { ReadUpdatableCommon(is, binary); // Read the opening tag and learning rate ExpectToken(is, binary, ""); scales_.Read(is, binary); @@ -3327,7 +3329,7 @@ void NaturalGradientPerElementScaleComponent::Read( } void NaturalGradientPerElementScaleComponent::Write(std::ostream &os, - bool binary) const { + bool binary) const { WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate WriteToken(os, binary, ""); scales_.Write(os, binary); @@ -3360,15 +3362,15 @@ std::string NaturalGradientPerElementScaleComponent::Info() const { void NaturalGradientPerElementScaleComponent::InitFromConfig(ConfigLine *cfl) { // First set various configuration values that have defaults. int32 rank = 8, // Use a small rank because in this case the amount of memory - // for the preconditioner actually exceeds the memory for the - // parameters (by "rank"). - update_period = 10; + // for the preconditioner actually exceeds the memory for the + // parameters (by "rank"). + update_period = 10; // the max_change_per_minibatch is the maximum amount of parameter-change, in 2-norm, // that we allow per minibatch; if change is greater than that, we scale down // the parameter-change. 
It has the same purpose as the max-change-per-sample in // the NaturalGradientAffineComponent. BaseFloat num_samples_history = 2000.0, alpha = 4.0, - max_change_per_minibatch = 0.0; + max_change_per_minibatch = 0.0; cfl->GetValue("rank", &rank); cfl->GetValue("update-period", &update_period); cfl->GetValue("num-samples-history", &num_samples_history); @@ -3400,10 +3402,10 @@ void NaturalGradientPerElementScaleComponent::InitFromConfig(ConfigLine *cfl) { } void NaturalGradientPerElementScaleComponent::Init( - int32 dim, BaseFloat param_mean, - BaseFloat param_stddev, int32 rank, int32 update_period, - BaseFloat num_samples_history, BaseFloat alpha, - BaseFloat max_change_per_minibatch) { + int32 dim, BaseFloat param_mean, + BaseFloat param_stddev, int32 rank, int32 update_period, + BaseFloat num_samples_history, BaseFloat alpha, + BaseFloat max_change_per_minibatch) { PerElementScaleComponent::Init(dim, param_mean, param_stddev); preconditioner_.SetRank(rank); @@ -3419,9 +3421,9 @@ void NaturalGradientPerElementScaleComponent::Init( } void NaturalGradientPerElementScaleComponent::Init( - std::string vector_filename, - int32 rank, int32 update_period, BaseFloat num_samples_history, - BaseFloat alpha, BaseFloat max_change_per_minibatch) { + std::string vector_filename, + int32 rank, int32 update_period, BaseFloat num_samples_history, + BaseFloat alpha, BaseFloat max_change_per_minibatch) { PerElementScaleComponent::Init(vector_filename); preconditioner_.SetRank(rank); preconditioner_.SetUpdatePeriod(update_period); @@ -3432,10 +3434,10 @@ void NaturalGradientPerElementScaleComponent::Init( NaturalGradientPerElementScaleComponent::NaturalGradientPerElementScaleComponent( - const NaturalGradientPerElementScaleComponent &other): - PerElementScaleComponent(other), - max_change_per_minibatch_(other.max_change_per_minibatch_), - preconditioner_(other.preconditioner_) { } + const NaturalGradientPerElementScaleComponent &other): + PerElementScaleComponent(other), + max_change_per_minibatch_(other.max_change_per_minibatch_), + preconditioner_(other.preconditioner_) { } @@ -3445,9 +3447,9 @@ Component* NaturalGradientPerElementScaleComponent::Copy() const { } void NaturalGradientPerElementScaleComponent::Update( - const std::string &debug_info, - const CuMatrixBase &in_value, - const CuMatrixBase &out_deriv) { + const std::string &debug_info, + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv) { CuMatrix derivs_per_frame(in_value); derivs_per_frame.MulElements(out_deriv); @@ -3464,46 +3466,46 @@ void NaturalGradientPerElementScaleComponent::Update( // Constructors for the convolution component ConvolutionComponent::ConvolutionComponent(): - UpdatableComponent(), - input_x_dim_(0), input_y_dim_(0), input_z_dim_(0), - filt_x_dim_(0), filt_y_dim_(0), - filt_x_step_(0), filt_y_step_(0), - input_vectorization_(kZyx), - is_gradient_(false) {} + UpdatableComponent(), + input_x_dim_(0), input_y_dim_(0), input_z_dim_(0), + filt_x_dim_(0), filt_y_dim_(0), + filt_x_step_(0), filt_y_step_(0), + input_vectorization_(kZyx), + is_gradient_(false) {} ConvolutionComponent::ConvolutionComponent( - const ConvolutionComponent &component): - UpdatableComponent(component), - input_x_dim_(component.input_x_dim_), - input_y_dim_(component.input_y_dim_), - input_z_dim_(component.input_z_dim_), - filt_x_dim_(component.filt_x_dim_), - filt_y_dim_(component.filt_y_dim_), - filt_x_step_(component.filt_x_step_), - filt_y_step_(component.filt_y_step_), - input_vectorization_(component.input_vectorization_), - 
filter_params_(component.filter_params_), - bias_params_(component.bias_params_), - is_gradient_(component.is_gradient_) {} + const ConvolutionComponent &component): + UpdatableComponent(component), + input_x_dim_(component.input_x_dim_), + input_y_dim_(component.input_y_dim_), + input_z_dim_(component.input_z_dim_), + filt_x_dim_(component.filt_x_dim_), + filt_y_dim_(component.filt_y_dim_), + filt_x_step_(component.filt_x_step_), + filt_y_step_(component.filt_y_step_), + input_vectorization_(component.input_vectorization_), + filter_params_(component.filter_params_), + bias_params_(component.bias_params_), + is_gradient_(component.is_gradient_) {} ConvolutionComponent::ConvolutionComponent( - const CuMatrixBase &filter_params, - const CuVectorBase &bias_params, - int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, - TensorVectorizationType input_vectorization, - BaseFloat learning_rate): - input_x_dim_(input_x_dim), - input_y_dim_(input_y_dim), - input_z_dim_(input_z_dim), - filt_x_dim_(filt_x_dim), - filt_y_dim_(filt_y_dim), - filt_x_step_(filt_x_step), - filt_y_step_(filt_y_step), - input_vectorization_(input_vectorization), - filter_params_(filter_params), - bias_params_(bias_params){ + const CuMatrixBase &filter_params, + const CuVectorBase &bias_params, + int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, + TensorVectorizationType input_vectorization, + BaseFloat learning_rate): + input_x_dim_(input_x_dim), + input_y_dim_(input_y_dim), + input_z_dim_(input_z_dim), + filt_x_dim_(filt_x_dim), + filt_y_dim_(filt_y_dim), + filt_x_step_(filt_x_step), + filt_y_step_(filt_y_step), + input_vectorization_(input_vectorization), + filter_params_(filter_params), + bias_params_(bias_params) { KALDI_ASSERT(filter_params.NumRows() == bias_params.Dim() && bias_params.Dim() != 0); KALDI_ASSERT(filter_params.NumCols() == filt_x_dim * filt_y_dim * input_z_dim); @@ -3526,11 +3528,11 @@ int32 ConvolutionComponent::OutputDim() const { // initialize the component using hyperparameters void ConvolutionComponent::Init( - int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, int32 num_filters, - TensorVectorizationType input_vectorization, - BaseFloat param_stddev, BaseFloat bias_stddev) { + int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, int32 num_filters, + TensorVectorizationType input_vectorization, + BaseFloat param_stddev, BaseFloat bias_stddev) { input_x_dim_ = input_x_dim; input_y_dim_ = input_y_dim; input_z_dim_ = input_z_dim; @@ -3553,11 +3555,11 @@ void ConvolutionComponent::Init( // initialize the component using predefined matrix file void ConvolutionComponent::Init( - int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, - int32 filt_x_dim, int32 filt_y_dim, - int32 filt_x_step, int32 filt_y_step, - TensorVectorizationType input_vectorization, - std::string matrix_filename) { + int32 input_x_dim, int32 input_y_dim, int32 input_z_dim, + int32 filt_x_dim, int32 filt_y_dim, + int32 filt_x_step, int32 filt_y_step, + TensorVectorizationType input_vectorization, + std::string matrix_filename) { input_x_dim_ = input_x_dim; input_y_dim_ = input_y_dim; input_z_dim_ = input_z_dim; @@ -3650,7 +3652,7 @@ void ConvolutionComponent::InitFromConfig(ConfigLine *cfl) { } if 
(cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); + << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); } @@ -3677,8 +3679,8 @@ inline int32 ZyxVectorIndex(int32 x, int32 y, int32 z, // 3D tensors to patches for convolution, each patch corresponds to // one dot product in the convolution void ConvolutionComponent::InputToInputPatches( - const CuMatrixBase& in, - CuMatrix *patches) const{ + const CuMatrixBase& in, + CuMatrix *patches) const { int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_); int32 num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_); const int32 filt_x_step = filt_x_step_, @@ -3707,9 +3709,9 @@ void ConvolutionComponent::InputToInputPatches( input_z_dim); } else if (input_vectorization_ == kYzx) { column_map[index] = YzxVectorIndex(x_step * filt_x_step + x, - y_step * filt_y_step + y, z, - input_x_dim, input_y_dim, - input_z_dim); + y_step * filt_y_step + y, z, + input_x_dim, input_y_dim, + input_z_dim); } } } @@ -3724,8 +3726,8 @@ void ConvolutionComponent::InputToInputPatches( // propagation function // see function declaration in nnet-simple-component.h for details void ConvolutionComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const CuMatrixBase &in, + CuMatrixBase *out) const { const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), num_filters = filter_params_.NumRows(), @@ -3739,8 +3741,8 @@ void ConvolutionComponent::Propagate(const ComponentPrecomputedIndexes *indexes, kUndefined); InputToInputPatches(in, &patches); CuSubMatrix* filter_params_elem = new CuSubMatrix( - filter_params_, 0, filter_params_.NumRows(), 0, - filter_params_.NumCols()); + filter_params_, 0, filter_params_.NumRows(), 0, + filter_params_.NumCols()); std::vector* > tgt_batch, patch_batch, filter_params_batch; @@ -3748,9 +3750,9 @@ void ConvolutionComponent::Propagate(const ComponentPrecomputedIndexes *indexes, for (int32 y_step = 0; y_step < num_y_steps; y_step++) { int32 patch_number = x_step * num_y_steps + y_step; tgt_batch.push_back(new CuSubMatrix( - out->ColRange(patch_number * num_filters, num_filters))); + out->ColRange(patch_number * num_filters, num_filters))); patch_batch.push_back(new CuSubMatrix( - patches.ColRange(patch_number * filter_dim, filter_dim))); + patches.ColRange(patch_number * filter_dim, filter_dim))); filter_params_batch.push_back(filter_params_elem); tgt_batch[patch_number]->AddVecToRows(1.0, bias_params_, 1.0); // add bias } @@ -3776,7 +3778,7 @@ void ConvolutionComponent::Scale(BaseFloat scale) { // add another convolution component void ConvolutionComponent::Add(BaseFloat alpha, const Component &other_in) { const ConvolutionComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); filter_params_.AddMat(alpha, other->filter_params_); bias_params_.AddVec(alpha, other->bias_params_); @@ -3794,7 +3796,7 @@ void ConvolutionComponent::Add(BaseFloat alpha, const Component &other_in) { where necessary if not all the input lists have the same side. 
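   For example (hypothetical values): if the reverse column map is
   in = { {0, 2}, {1}, {} }, the rearranged form is
   out = { {0, 1, -1}, {2, -1, -1} }, i.e. out[j][i] is the j'th entry
   of in[i] and the short lists are padded with -1 (read here as a
   "no source row" marker for the batched copy/add that follows). In
   effect the ragged list-of-lists is transposed into L fixed-width
   lists of length D, so the backprop can be done as L batched row
   operations instead of one scattered pass.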
*/ void RearrangeIndexes(const std::vector > &in, - std::vector > *out) { + std::vector > *out) { int32 D = in.size(); int32 L = 0; for (int32 i = 0; i < D; i++) @@ -3814,8 +3816,8 @@ void RearrangeIndexes(const std::vector > &in, // for patches, where each patch corresponds to one dot product // in the convolution void ConvolutionComponent::InderivPatchesToInderiv( - const CuMatrix& in_deriv_patches, - CuMatrixBase *in_deriv) const { + const CuMatrix& in_deriv_patches, + CuMatrixBase *in_deriv) const { const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), @@ -3877,7 +3879,7 @@ void ConvolutionComponent::Backprop(const std::string &debug_info, Component *to_update_in, CuMatrixBase *in_deriv) const { ConvolutionComponent *to_update = - dynamic_cast(to_update_in); + dynamic_cast(to_update_in); const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_), num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_), num_filters = filter_params_.NumRows(), @@ -3894,20 +3896,20 @@ void ConvolutionComponent::Backprop(const std::string &debug_info, kSetZero); std::vector* > patch_deriv_batch, out_deriv_batch, - filter_params_batch; + filter_params_batch; CuSubMatrix* filter_params_elem = new CuSubMatrix( - filter_params_, 0, filter_params_.NumRows(), 0, - filter_params_.NumCols()); + filter_params_, 0, filter_params_.NumRows(), 0, + filter_params_.NumCols()); for (int32 x_step = 0; x_step < num_x_steps; x_step++) { for (int32 y_step = 0; y_step < num_y_steps; y_step++) { int32 patch_number = x_step * num_y_steps + y_step; patch_deriv_batch.push_back(new CuSubMatrix( - in_deriv_patches.ColRange( - patch_number * filter_dim, filter_dim))); + in_deriv_patches.ColRange( + patch_number * filter_dim, filter_dim))); out_deriv_batch.push_back(new CuSubMatrix(out_deriv.ColRange( - patch_number * num_filters, num_filters))); + patch_number * num_filters, num_filters))); filter_params_batch.push_back(filter_params_elem); } } @@ -3965,8 +3967,8 @@ void ConvolutionComponent::Update(const std::string &debug_info, // create a single large matrix holding the smaller matrices // from the vector container filters_grad_batch along the rows CuMatrix filters_grad_blocks_batch( - num_x_steps * num_y_steps * filters_grad.NumRows(), - filters_grad.NumCols()); + num_x_steps * num_y_steps * filters_grad.NumRows(), + filters_grad.NumCols()); std::vector* > filters_grad_batch, input_patch_batch; @@ -3974,12 +3976,12 @@ void ConvolutionComponent::Update(const std::string &debug_info, for (int32 y_step = 0; y_step < num_y_steps; y_step++) { int32 patch_number = x_step * num_y_steps + y_step; filters_grad_batch.push_back(new CuSubMatrix( - filters_grad_blocks_batch.RowRange( - patch_number * filters_grad.NumRows(), - filters_grad.NumRows()))); + filters_grad_blocks_batch.RowRange( + patch_number * filters_grad.NumRows(), + filters_grad.NumRows()))); input_patch_batch.push_back(new CuSubMatrix( - input_patches.ColRange(patch_number * filter_dim, filter_dim))); + input_patches.ColRange(patch_number * filter_dim, filter_dim))); } } @@ -3991,7 +3993,7 @@ void ConvolutionComponent::Update(const std::string &debug_info, // create a matrix holding the col blocks sum of out_deriv CuMatrix out_deriv_col_blocks_sum(out_deriv.NumRows(), - num_filters); + num_filters); // add the col blocks together to out_deriv_col_blocks_sum out_deriv_col_blocks_sum.AddMatBlocks(1.0, out_deriv); @@ -4084,7 +4086,7 @@ void 
ConvolutionComponent::Write(std::ostream &os, bool binary) const { BaseFloat ConvolutionComponent::DotProduct(const UpdatableComponent &other_in) const { const ConvolutionComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); return TraceMatMat(filter_params_, other->filter_params_, kTrans) + VecVec(bias_params_, other->bias_params_); } @@ -4134,16 +4136,16 @@ int32 MaxpoolingComponent::InputDim() const { } MaxpoolingComponent::MaxpoolingComponent( - const MaxpoolingComponent &component): - input_x_dim_(component.input_x_dim_), - input_y_dim_(component.input_y_dim_), - input_z_dim_(component.input_z_dim_), - pool_x_size_(component.pool_x_size_), - pool_y_size_(component.pool_y_size_), - pool_z_size_(component.pool_z_size_), - pool_x_step_(component.pool_x_step_), - pool_y_step_(component.pool_y_step_), - pool_z_step_(component.pool_z_step_) { } + const MaxpoolingComponent &component): + input_x_dim_(component.input_x_dim_), + input_y_dim_(component.input_y_dim_), + input_z_dim_(component.input_z_dim_), + pool_x_size_(component.pool_x_size_), + pool_y_size_(component.pool_y_size_), + pool_z_size_(component.pool_z_size_), + pool_x_step_(component.pool_x_step_), + pool_y_step_(component.pool_y_step_), + pool_z_step_(component.pool_z_step_) { } // aquire output dim int32 MaxpoolingComponent::OutputDim() const { @@ -4203,15 +4205,15 @@ void MaxpoolingComponent::InitFromConfig(ConfigLine *cfl) { // 3D tensors to patches for 3d max pooling, each patch corresponds to // the nodes having the same local coordinatenodes from each pool void MaxpoolingComponent::InputToInputPatches( - const CuMatrixBase& in, - CuMatrix *patches) const{ + const CuMatrixBase& in, + CuMatrix *patches) const { int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_; std::vector column_map(patches->NumCols()); int32 column_map_size = column_map.size(); - for (int32 x = 0, index =0; x < pool_x_size_; x++) { + for (int32 x = 0, index = 0; x < pool_x_size_; x++) { for (int32 y = 0; y < pool_y_size_; y++) { for (int32 z = 0; z < pool_z_size_; z++) { // given the local node coordinate, group them from each pool @@ -4263,8 +4265,8 @@ void MaxpoolingComponent::Propagate(const ComponentPrecomputedIndexes *indexes, // for patches, where each patch corresponds to // the nodes having the same local coordinatenodes from each pool void MaxpoolingComponent::InderivPatchesToInderiv( - const CuMatrix& in_deriv_patches, - CuMatrixBase *in_deriv) const { + const CuMatrix& in_deriv_patches, + CuMatrixBase *in_deriv) const { int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_; int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_; @@ -4280,8 +4282,8 @@ void MaxpoolingComponent::InderivPatchesToInderiv( for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) { for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) { int32 vector_index = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ + - (y_pool * pool_y_step_ + y) * input_z_dim_ + - (z_pool * pool_z_step_ + z); + (y_pool * pool_y_step_ + y) * input_z_dim_ + + (z_pool * pool_z_step_ + z); KALDI_ASSERT(vector_index < rev_col_map_size); reverse_column_map[vector_index].push_back(index); @@ -4448,7 +4450,7 @@ void PermuteComponent::InitFromConfig(ConfigLine *cfl) { << column_map_str; if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " - << 
cfl->UnusedValues(); + << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; @@ -4514,7 +4516,7 @@ std::string PermuteComponent::Info() const { bool CompositeComponent::IsUpdatable() const { for (std::vector::const_iterator iter = components_.begin(), - end = components_.end(); iter != end; ++iter) + end = components_.end(); iter != end; ++iter) if (((*iter)->Properties() & kUpdatableComponent) != 0) return true; return false; @@ -4536,16 +4538,16 @@ int32 CompositeComponent::OutputDim() const { int32 CompositeComponent::Properties() const { KALDI_ASSERT(!components_.empty()); int32 last_component_properties = components_.back()->Properties(), - first_component_properties = components_.front()->Properties(); + first_component_properties = components_.front()->Properties(); // We always assume backprop needs the input, as this would be necessary to // get the activations at intermediate layers, if these were not needed in // backprop, there would be no reason to use a CompositeComponent. int32 ans = kSimpleComponent | kBackpropNeedsInput | - (last_component_properties & - (kPropagateAdds|kBackpropNeedsOutput|kOutputContiguous)) | - (first_component_properties & - (kBackpropAdds|kInputContiguous)) | - (IsUpdatable() ? kUpdatableComponent : 0); + (last_component_properties & + (kPropagateAdds | kBackpropNeedsOutput | kOutputContiguous)) | + (first_component_properties & + (kBackpropAdds | kInputContiguous)) | + (IsUpdatable() ? kUpdatableComponent : 0); // note, we don't return the kStoresStats property because that function is // not implemented; instead, for efficiency, we call StoreStats() on any // sub-components as part of the backprop phase. @@ -4568,13 +4570,13 @@ MatrixStrideType CompositeComponent::GetStrideType(int32 i) const { // virtual void CompositeComponent::Propagate( - const ComponentPrecomputedIndexes *, // indexes - const CuMatrixBase &in, - CuMatrixBase *out) const { + const ComponentPrecomputedIndexes *, // indexes + const CuMatrixBase &in, + CuMatrixBase *out) const { KALDI_ASSERT(in.NumRows() == out->NumRows() && in.NumCols() == InputDim() && out->NumCols() == OutputDim()); int32 num_rows = in.NumRows(), - num_components = components_.size(); + num_components = components_.size(); if (max_rows_process_ > 0 && num_rows > max_rows_process_) { // recurse and process smaller parts of the data, to save memory. for (int32 row_offset = 0; row_offset < num_rows; @@ -4593,15 +4595,15 @@ void CompositeComponent::Propagate( for (int32 i = 0; i < num_components; i++) { if (i + 1 < num_components) { MatrixResizeType resize_type = - ((components_[i]->Properties() & kPropagateAdds) ? - kSetZero : kUndefined); + ((components_[i]->Properties() & kPropagateAdds) ? + kSetZero : kUndefined); intermediate_outputs[i].Resize(num_rows, components_[i]->OutputDim(), resize_type, GetStrideType(i)); } - components_[i]->Propagate(NULL, (i == 0 ? in : intermediate_outputs[i-1]), - (i + 1 == num_components ? out : &(intermediate_outputs[i]))); + components_[i]->Propagate(NULL, (i == 0 ? in : intermediate_outputs[i - 1]), + (i + 1 == num_components ? out : & (intermediate_outputs[i]))); if (i > 0) - intermediate_outputs[i-1].Resize(0, 0); + intermediate_outputs[i - 1].Resize(0, 0); } } @@ -4619,7 +4621,7 @@ void CompositeComponent::Init(const std::vector &components, if (i > 0) { // make sure all the internal dimensions match up. 
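      // (i.e. components_[i]->InputDim() must equal
      // components_[i-1]->OutputDim(); for a hypothetical composite
      // 512 -> 256 -> 256 -> 40, the sub-components must be declared as
      // 512/256, 256/256 and 256/40, and anything else trips the assert
      // below.)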
KALDI_ASSERT(components_[i]->InputDim() == - components_[i-1]->OutputDim()); + components_[i - 1]->OutputDim()); } } } @@ -4681,7 +4683,7 @@ void CompositeComponent::ZeroStats() { // will do nothing if the component doesn't store stats. (components like // ReLU and sigmoid and tanh store stats on activations). for (size_t i = 0; i < components_.size(); i++) - components_[i]->ZeroStats(); + components_[i]->ZeroStats(); } // virtual @@ -4710,7 +4712,7 @@ void CompositeComponent::Backprop(const std::string &debug_info, in_value.NumCols() == InputDim() && out_deriv.NumCols() == OutputDim()); int32 num_rows = in_value.NumRows(), - num_components = components_.size(); + num_components = components_.size(); if (max_rows_process_ > 0 && num_rows > max_rows_process_) { KALDI_ASSERT(max_rows_process_ > 0); // recurse and process smaller parts of the data, to save memory. @@ -4722,18 +4724,18 @@ void CompositeComponent::Backprop(const std::string &debug_info, // out_value_part will only be used if out_value is nonempty; otherwise we // make it a submatrix of 'out_deriv' to avoid errors in the constructor. const CuSubMatrix out_value_part(have_output_value ? out_value : out_deriv, - row_offset, this_num_rows, - 0, out_deriv.NumCols()); + row_offset, this_num_rows, + 0, out_deriv.NumCols()); // in_deriv_value_part will only be used if in_deriv != NULL; otherwise we // make it a submatrix of 'in_value' to avoid errors in the constructor. CuSubMatrix in_deriv_part(in_deriv != NULL ? *in_deriv : in_value, - row_offset, this_num_rows, - 0, in_value.NumCols()); + row_offset, this_num_rows, + 0, in_value.NumCols()); CuSubMatrix in_value_part(in_value, row_offset, this_num_rows, 0, in_value.NumCols()); const CuSubMatrix out_deriv_part(out_deriv, - row_offset, this_num_rows, - 0, out_deriv.NumCols()); + row_offset, this_num_rows, + 0, out_deriv.NumCols()); CuMatrix empty_mat; this->Backprop(debug_info, NULL, in_value_part, (have_output_value ? static_cast&>(out_value_part) : @@ -4759,27 +4761,27 @@ void CompositeComponent::Backprop(const std::string &debug_info, // backprop doesn't need the input and the one previous to that doesn't // need the output. [lowest hanging fruit for optimization] if (i + 2 == num_components && - !(components_[i+1]->Properties() & kBackpropNeedsInput) && + !(components_[i + 1]->Properties() & kBackpropNeedsInput) && !(components_[i]->Properties() & kBackpropNeedsOutput)) break; MatrixResizeType resize_type = - ((components_[i]->Properties() & kPropagateAdds) ? - kSetZero : kUndefined); + ((components_[i]->Properties() & kPropagateAdds) ? + kSetZero : kUndefined); intermediate_outputs[i].Resize(num_rows, components_[i]->OutputDim(), resize_type, GetStrideType(i)); components_[i]->Propagate(NULL, - (i == 0 ? in_value : intermediate_outputs[i-1]), + (i == 0 ? in_value : intermediate_outputs[i - 1]), &(intermediate_outputs[i])); } for (int32 i = num_components - 1; i >= 0; i--) { Component *component_to_update = - (to_update == NULL ? NULL : - dynamic_cast(to_update)->components_[i]); + (to_update == NULL ? NULL : + dynamic_cast(to_update)->components_[i]); if (components_[i]->Properties() & kStoresStats && component_to_update != NULL) component_to_update->StoreStats( - (i + 1 == num_components ? out_value : intermediate_outputs[i])); + (i + 1 == num_components ? out_value : intermediate_outputs[i])); // skip the first component's backprop if it's not updatable and in_deriv is // not requested. Again, this is the lowest-hanging fruit to optimize. 
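The max_rows_process_ logic above appears in both Propagate and Backprop: large minibatches are processed in row chunks so that the intermediate matrices of a deep composite never all exist for the full minibatch at once. A minimal stand-alone sketch of just the chunking loop, with the Kaldi submatrix plumbing replaced by a hypothetical callback:

    #include <algorithm>
    #include <functional>

    // Visit [0, num_rows) in chunks of at most max_rows_process rows,
    // mirroring the row_offset loop in CompositeComponent::Propagate()
    // and Backprop(); 'body' stands in for the recursive call on the
    // row sub-range of the input/output matrices.
    void ForEachRowChunk(int num_rows, int max_rows_process,
                         const std::function<void(int offset, int n)> &body) {
      if (max_rows_process > 0 && num_rows > max_rows_process) {
        for (int row_offset = 0; row_offset < num_rows;
             row_offset += max_rows_process) {
          int this_num_rows = std::min(max_rows_process,
                                       num_rows - row_offset);
          body(row_offset, this_num_rows);  // one chunk of frames
        }
      } else {
        body(0, num_rows);  // small enough to do in one pass
      }
    }

The rest of the saving comes from the second half of Propagate(): each intermediate_outputs[i-1] is resized to 0x0 as soon as component i has consumed it, so only neighbouring intermediates are live at any time.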
@@ -4788,17 +4790,17 @@ void CompositeComponent::Backprop(const std::string &debug_info, break; if (i > 0) { MatrixResizeType resize_type = - ((components_[i]->Properties() & kBackpropAdds) ? - kSetZero : kUndefined); - intermediate_derivs[i-1].Resize(num_rows, components_[i]->InputDim(), - resize_type, GetStrideType(i - 1)); + ((components_[i]->Properties() & kBackpropAdds) ? + kSetZero : kUndefined); + intermediate_derivs[i - 1].Resize(num_rows, components_[i]->InputDim(), + resize_type, GetStrideType(i - 1)); } components_[i]->Backprop(debug_info, NULL, - (i == 0 ? in_value : intermediate_outputs[i-1]), + (i == 0 ? in_value : intermediate_outputs[i - 1]), (i + 1 == num_components ? out_value : intermediate_outputs[i]), (i + 1 == num_components ? out_deriv : intermediate_derivs[i]), component_to_update, - (i == 0 ? in_deriv : &(intermediate_derivs[i-1]))); + (i == 0 ? in_deriv : & (intermediate_derivs[i - 1]))); } } @@ -4809,7 +4811,7 @@ std::string CompositeComponent::Info() const { stream << Type() << " "; for (size_t i = 0; i < components_.size(); i++) { if (i > 0) stream << ", "; - stream << "sub-component" << (i+1) << " = { " + stream << "sub-component" << (i + 1) << " = { " << components_[i]->Info() << " }"; } return stream.str(); @@ -4824,7 +4826,7 @@ void CompositeComponent::Scale(BaseFloat scale) { // virtual void CompositeComponent::Add(BaseFloat alpha, const Component &other_in) { const CompositeComponent *other = dynamic_cast( - &other_in); + &other_in); KALDI_ASSERT(other != NULL && other->components_.size() == components_.size() && "Mismatching nnet topologies"); for (size_t i = 0; i < components_.size(); i++) @@ -4837,7 +4839,7 @@ void CompositeComponent::SetZero(bool treat_as_gradient) { for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = - dynamic_cast(components_[i]); + dynamic_cast(components_[i]); uc->SetZero(treat_as_gradient); } } @@ -4849,7 +4851,7 @@ void CompositeComponent::PerturbParams(BaseFloat stddev) { for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = - dynamic_cast(components_[i]); + dynamic_cast(components_[i]); uc->PerturbParams(stddev); } } @@ -4865,7 +4867,7 @@ void CompositeComponent::SetUnderlyingLearningRate(BaseFloat lrate) { for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = - dynamic_cast(components_[i]); + dynamic_cast(components_[i]); uc->SetUnderlyingLearningRate(effective_lrate); } } @@ -4877,7 +4879,7 @@ void CompositeComponent::SetActualLearningRate(BaseFloat lrate) { for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = - dynamic_cast(components_[i]); + dynamic_cast(components_[i]); uc->SetActualLearningRate(lrate); } } @@ -4890,7 +4892,7 @@ int32 CompositeComponent::NumParameters() const { for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = - dynamic_cast(components_[i]); + dynamic_cast(components_[i]); ans += uc->NumParameters(); } } @@ -4904,7 +4906,7 @@ void CompositeComponent::Vectorize(VectorBase *params) const { for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = - dynamic_cast(components_[i]); + dynamic_cast(components_[i]); int32 this_size = 
uc->NumParameters(); SubVector params_range(*params, cur_offset, this_size); uc->Vectorize(¶ms_range); @@ -4921,7 +4923,7 @@ void CompositeComponent::UnVectorize(const VectorBase ¶ms) { for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = - dynamic_cast(components_[i]); + dynamic_cast(components_[i]); int32 this_size = uc->NumParameters(); SubVector params_range(params, cur_offset, this_size); uc->UnVectorize(params_range); @@ -4933,18 +4935,18 @@ void CompositeComponent::UnVectorize(const VectorBase ¶ms) { // virtual BaseFloat CompositeComponent::DotProduct( - const UpdatableComponent &other_in) const { + const UpdatableComponent &other_in) const { const CompositeComponent *other = dynamic_cast( - &other_in); + &other_in); KALDI_ASSERT(other != NULL && other->components_.size() == components_.size() && "Mismatching nnet topologies"); BaseFloat ans = 0.0; for (size_t i = 0.0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = - dynamic_cast(components_[i]); + dynamic_cast(components_[i]); const UpdatableComponent *uc_other = - dynamic_cast(other->components_[i]); + dynamic_cast(other->components_[i]); KALDI_ASSERT(uc != NULL && uc_other != NULL); ans += uc->DotProduct(*uc_other); } @@ -4994,7 +4996,7 @@ void CompositeComponent::InitFromConfig(ConfigLine *cfl) { << "(or undefined or bad component type [type=xxx]), in " << "CompositeComponent config line '" << cfl->WholeLine() << "'"; } - if(this_component->Type() == "CompositeComponent") { + if (this_component->Type() == "CompositeComponent") { DeletePointers(&components); delete this_component; KALDI_ERR << "Found CompositeComponent nested within CompositeComponent." 
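Everything from NumParameters() down to UnVectorize() above follows one running-offset pattern: the updatable sub-components are flattened into a single parameter vector, each reading or writing its own sub-range. A plain-C++ sketch with a toy Updatable type (not the Kaldi classes) standing in for the sub-components:

    #include <cassert>
    #include <vector>

    // Toy stand-in for the parameter-vector interface of an updatable
    // component.
    struct Updatable {
      std::vector<float> params;
      int NumParameters() const { return static_cast<int>(params.size()); }
      void Vectorize(float *dst) const {
        for (int i = 0; i < NumParameters(); i++) dst[i] = params[i];
      }
    };

    // Flatten all sub-components into one vector; each fills its own
    // sub-range at a running offset, exactly the cur_offset bookkeeping
    // in CompositeComponent::Vectorize().
    std::vector<float> VectorizeAll(const std::vector<Updatable> &comps) {
      int total = 0;
      for (size_t i = 0; i < comps.size(); i++)
        total += comps[i].NumParameters();
      std::vector<float> out(total);
      int cur_offset = 0;
      for (size_t i = 0; i < comps.size(); i++) {
        comps[i].Vectorize(out.data() + cur_offset);
        cur_offset += comps[i].NumParameters();
      }
      assert(cur_offset == total);  // the sub-ranges tile the whole vector
      return out;
    }

UnVectorize() is the mirror image with the copies reversed, and DotProduct() pairs up the sub-components of two identically structured networks the same way.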
@@ -5117,7 +5119,8 @@ std::string LstmNonlinearityComponent::Info() const { << std::setprecision(6); } static const char *nonlin_names[] = { "i_t_sigmoid", "f_t_sigmoid", "c_t_tanh", - "o_t_sigmoid", "m_t_tanh" }; + "o_t_sigmoid", "m_t_tanh" + }; for (int32 i = 0; i < 5; i++) { stream << ", " << nonlin_names[i] << "={"; stream << " self-repair-lower-threshold=" << self_repair_config_(i) @@ -5125,10 +5128,10 @@ std::string LstmNonlinearityComponent::Info() const { if (count_ != 0) { BaseFloat self_repaired_proportion = - self_repair_total_(i) / (count_ * cell_dim); + self_repair_total_(i) / (count_ * cell_dim); stream << ", self-repaired-proportion=" << self_repaired_proportion; Vector value_sum(value_sum_.Row(i)), - deriv_sum(deriv_sum_.Row(i)); + deriv_sum(deriv_sum_.Row(i)); Vector value_avg(value_sum), deriv_avg(deriv_sum); value_avg.Scale(1.0 / count_); deriv_avg.Scale(1.0 / count_); @@ -5156,7 +5159,7 @@ void LstmNonlinearityComponent::Scale(BaseFloat scale) { void LstmNonlinearityComponent::Add(BaseFloat alpha, const Component &other_in) { const LstmNonlinearityComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); params_.AddMat(alpha, other->params_); value_sum_.AddMat(alpha, other->value_sum_); @@ -5184,9 +5187,9 @@ void LstmNonlinearityComponent::PerturbParams(BaseFloat stddev) { } BaseFloat LstmNonlinearityComponent::DotProduct( - const UpdatableComponent &other_in) const { + const UpdatableComponent &other_in) const { const LstmNonlinearityComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); return TraceMatMat(params_, other->params_, kTrans); } @@ -5202,28 +5205,28 @@ void LstmNonlinearityComponent::Vectorize(VectorBase *params) const { void LstmNonlinearityComponent::UnVectorize( - const VectorBase ¶ms) { + const VectorBase ¶ms) { KALDI_ASSERT(params.Dim() == NumParameters()); params_.CopyRowsFromVec(params); } void LstmNonlinearityComponent::Propagate( - const ComponentPrecomputedIndexes *, // indexes - const CuMatrixBase &in, - CuMatrixBase *out) const { + const ComponentPrecomputedIndexes *, // indexes + const CuMatrixBase &in, + CuMatrixBase *out) const { cu::ComputeLstmNonlinearity(in, params_, out); } void LstmNonlinearityComponent::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value, - const CuMatrixBase &out_deriv, - Component *to_update_in, - CuMatrixBase *in_deriv) const { + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + Component *to_update_in, + CuMatrixBase *in_deriv) const { if (to_update_in == NULL) { cu::BackpropLstmNonlinearity(in_value, params_, out_deriv, @@ -5235,7 +5238,7 @@ void LstmNonlinearityComponent::Backprop( (CuMatrixBase*) NULL); } else { LstmNonlinearityComponent *to_update = - dynamic_cast(to_update_in); + dynamic_cast(to_update_in); KALDI_ASSERT(to_update != NULL); int32 cell_dim = params_.NumCols(); @@ -5257,7 +5260,7 @@ void LstmNonlinearityComponent::Backprop( BaseFloat scale = 1.0; if (!to_update->is_gradient_) { to_update->preconditioner_.PreconditionDirections( - ¶ms_deriv, NULL, &scale); + ¶ms_deriv, NULL, &scale); } to_update->params_.AddMat(to_update->learning_rate_ * scale, params_deriv); @@ -5265,21 +5268,21 @@ void LstmNonlinearityComponent::Backprop( } 
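// The five nonlinearity names reported by Info() above (i_t_sigmoid,
// f_t_sigmoid, c_t_tanh, o_t_sigmoid, m_t_tanh) correspond to the usual
// peephole-LSTM cell. Assuming that formulation, here is a scalar sketch
// of what cu::ComputeLstmNonlinearity evaluates for one cell dimension,
// with w_ic, w_fc and w_oc taken from the three rows of params_ (names
// ours; the real kernel operates on whole rows of the input matrix
// [i_part, f_part, c_part, o_part, c_prev]):
#include <cmath>

namespace sketch {

inline float Sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

struct CellOut { float c_t, m_t; };  // the two halves of the output row

inline CellOut LstmNonlinearity(float i_part, float f_part, float c_part,
                                float o_part, float c_prev,
                                float w_ic, float w_fc, float w_oc) {
  float i_t = Sigmoid(i_part + w_ic * c_prev);          // input gate
  float f_t = Sigmoid(f_part + w_fc * c_prev);          // forget gate
  float c_t = f_t * c_prev + i_t * std::tanh(c_part);   // new cell state
  float o_t = Sigmoid(o_part + w_oc * c_t);             // output gate
  CellOut out;
  out.c_t = c_t;
  out.m_t = o_t * std::tanh(c_t);                       // gated cell output
  return out;
}

}  // namespace sketch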
LstmNonlinearityComponent::LstmNonlinearityComponent( - const LstmNonlinearityComponent &other): - UpdatableComponent(other), - params_(other.params_), - value_sum_(other.value_sum_), - deriv_sum_(other.deriv_sum_), - self_repair_config_(other.self_repair_config_), - self_repair_total_(other.self_repair_total_), - count_(other.count_), - preconditioner_(other.preconditioner_) { } + const LstmNonlinearityComponent &other): + UpdatableComponent(other), + params_(other.params_), + value_sum_(other.value_sum_), + deriv_sum_(other.deriv_sum_), + self_repair_config_(other.self_repair_config_), + self_repair_total_(other.self_repair_total_), + count_(other.count_), + preconditioner_(other.preconditioner_) { } void LstmNonlinearityComponent::Init( - int32 cell_dim, BaseFloat param_stddev, - BaseFloat tanh_self_repair_threshold, - BaseFloat sigmoid_self_repair_threshold, - BaseFloat self_repair_scale) { + int32 cell_dim, BaseFloat param_stddev, + BaseFloat tanh_self_repair_threshold, + BaseFloat sigmoid_self_repair_threshold, + BaseFloat self_repair_scale) { KALDI_ASSERT(cell_dim > 0 && param_stddev >= 0.0 && tanh_self_repair_threshold >= 0.0 && tanh_self_repair_threshold <= 1.0 && @@ -5326,8 +5329,8 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { // self-repair config values for the individual sigmoid and tanh // nonlinearities, we can modify this code then. BaseFloat tanh_self_repair_threshold = 0.2, - sigmoid_self_repair_threshold = 0.05, - self_repair_scale = 1.0e-05; + sigmoid_self_repair_threshold = 0.05, + self_repair_scale = 1.0e-05; // param_stddev is the stddev of the parameters. it may be better to // use a smaller value but this was the default in the python scripts // for a while. @@ -5346,7 +5349,7 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); + << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; From d0290c35c2d3900dea4d4a940ef61a9ab41b353b Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Mon, 20 Feb 2017 11:11:36 +0800 Subject: [PATCH 12/12] Revert "sublime tool to formate nnet-simple-component.cc" This reverts commit 463a4dc2a79a731efe0f96ba7ea912133787e8e8. 
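For reference, the behaviour this series adds (the present commit only reverts formatting churn): with dropout-per-frame=true, one Bernoulli mask value is drawn per row, i.e. per frame, and shared across that row's dimensions, so a frame is either kept whole or zeroed whole; the default remains one mask per element. A stand-alone sketch of the two modes as we read the series, using plain C++ and std::mt19937 rather than the component's CuRand path, and with no 1/(1-p) rescaling, matching the component's Propagate():

    #include <random>
    #include <vector>

    // Zero entries (element mode) or whole rows (per-frame mode) with
    // probability dropout_proportion; survivors are left unscaled.
    void ApplyDropout(std::vector<std::vector<float> > *mat,
                      float dropout_proportion, bool dropout_per_frame,
                      std::mt19937 *rng) {
      std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
      for (size_t r = 0; r < mat->size(); r++) {
        // mask = Heaviside(u - p): kept with probability 1 - p.
        float row_mask =
            (uniform(*rng) - dropout_proportion > 0.0f) ? 1.0f : 0.0f;
        for (size_t c = 0; c < (*mat)[r].size(); c++) {
          float mask = dropout_per_frame
              ? row_mask  // one draw shared by the whole frame
              : ((uniform(*rng) - dropout_proportion > 0.0f) ? 1.0f : 0.0f);
          (*mat)[r][c] *= mask;
        }
      }
    }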
--- src/nnet3/nnet-simple-component.cc | 959 ++++++++++++++--------------- 1 file changed, 478 insertions(+), 481 deletions(-) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 2c565283b17..a94486fe309 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -41,7 +41,7 @@ void PnormComponent::InitFromConfig(ConfigLine *cfl) { int32 input_dim = 0; int32 output_dim = 0; bool ok = cfl->GetValue("output-dim", &output_dim) && - cfl->GetValue("input-dim", &input_dim); + cfl->GetValue("input-dim", &input_dim); if (!ok || cfl->HasUnusedValues() || output_dim <= 0) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; @@ -99,11 +99,11 @@ void DropoutComponent::InitFromConfig(ConfigLine *cfl) { BaseFloat dropout_proportion = 0.0; bool dropout_per_frame = false; bool ok = cfl->GetValue("dim", &dim) && - cfl->GetValue("dropout-proportion", &dropout_proportion) && - cfl->GetValue("dropout-per-frame", &dropout_per_frame); + cfl->GetValue("dropout-proportion", &dropout_proportion) && + cfl->GetValue("dropout-per-frame", &dropout_per_frame); if (!ok || cfl->HasUnusedValues() || dim <= 0 || dropout_proportion < 0.0 || dropout_proportion > 1.0 || - (dropout_per_frame != false and dropout_per_frame != true)) + (dropout_per_frame != false and dropout_per_frame != true)) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; Init(dim, dropout_proportion, dropout_per_frame); @@ -125,14 +125,14 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes, BaseFloat dropout = dropout_proportion_; KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0); - if (dropout_per_frame_) { + if(dropout_per_frame_) { // This const_cast is only safe assuming you don't attempt // to use multi-threaded code with the GPU. const_cast&>(random_generator_).RandUniform(out); out->Add(-dropout); // now, a proportion "dropout" will be <0.0 out->ApplyHeaviside(); // apply the function (x>0?1:0). Now, a proportion "dropout" will - // be zero and (1 - dropout) will be 1.0. + // be zero and (1 - dropout) will be 1.0. out->MulElements(in); } else { @@ -173,17 +173,17 @@ void DropoutComponent::Read(std::istream &is, bool binary) { //back-compatibility code. std::string token; ReadToken(is, binary, &token); - if (token == "") { + if(token == ""){ ReadToken(is, binary, &token); } KALDI_ASSERT(token == ""); ReadBasicType(is, binary, &dim_); // read dimension. 
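  // (Back-compatibility: each optional field below -- the dropout
  // proportion and, for models written after this patch, the
  // dropout-per-frame flag -- is read only if its opening token comes
  // next in the stream, so older models on disk still load, and any
  // field that is absent keeps its in-memory default.)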
ReadToken(is, binary, &token); - if (token == "") { + if(token == ""){ ReadBasicType(is, binary, &dropout_proportion_); // read dropout rate } ReadToken(is, binary, &token); - if (token == "") { + if(token == ""){ ReadBasicType(is, binary, &dropout_per_frame_); // read dropout mode } ReadToken(is, binary, &token); @@ -212,7 +212,7 @@ void SumReduceComponent::InitFromConfig(ConfigLine *cfl) { int32 input_dim = 0; int32 output_dim = 0; bool ok = cfl->GetValue("output-dim", &output_dim) && - cfl->GetValue("input-dim", &input_dim); + cfl->GetValue("input-dim", &input_dim); if (!ok || cfl->HasUnusedValues() || output_dim <= 0) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; @@ -285,7 +285,7 @@ void ElementwiseProductComponent::InitFromConfig(ConfigLine *cfl) { int32 input_dim = 0; int32 output_dim = 0; bool ok = cfl->GetValue("output-dim", &output_dim) && - cfl->GetValue("input-dim", &input_dim); + cfl->GetValue("input-dim", &input_dim); if (!ok || cfl->HasUnusedValues() || output_dim <= 0) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; @@ -293,9 +293,9 @@ void ElementwiseProductComponent::InitFromConfig(ConfigLine *cfl) { } void ElementwiseProductComponent::Propagate( - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { KALDI_ASSERT(in.NumCols() == input_dim_); int32 num_inputs = input_dim_ / output_dim_; for (int32 i = 0; i < num_inputs; i++) { @@ -310,12 +310,12 @@ void ElementwiseProductComponent::Propagate( } void ElementwiseProductComponent::Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - Component *to_update, - CuMatrixBase *in_deriv) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + Component *to_update, + CuMatrixBase *in_deriv) const { if (!in_deriv) return; int32 num_inputs = input_dim_ / output_dim_; for (int32 i = 0; i < num_inputs; i++) { @@ -327,9 +327,9 @@ void ElementwiseProductComponent::Backprop(const std::string &debug_info, if (i == j) continue; CuSubMatrix in_value_partition(in_value, 0, - in_value.NumRows(), - j * output_dim_, - output_dim_); + in_value.NumRows(), + j * output_dim_, + output_dim_); current_in_deriv.MulElements(in_value_partition); } } @@ -354,7 +354,7 @@ void ElementwiseProductComponent::Write(std::ostream &os, bool binary) const { } const BaseFloat NormalizeComponent::kSquaredNormFloor = - pow(2.0, NormalizeComponent::kExpSquaredNormFloor); + pow(2.0, NormalizeComponent::kExpSquaredNormFloor); // This component modifies the vector of activations by scaling it // so that the root-mean-square equals 1.0. 
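// Concretely, for an input row x of dimension D the forward rule is
//   y = x * target_rms / sqrt(max(epsi, x^T x / D)),
// matching the "f = log(sqrt(max(epsi, x^T x / D)))" convention used in
// the Backprop comments below; the floor keeps the scale finite on
// all-zero rows, and with add-log-stddev=true an extra output column
// carries that same f = log(rms) value.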
It's important that its @@ -369,15 +369,15 @@ void NormalizeComponent::Init(int32 input_dim, BaseFloat target_rms, } NormalizeComponent::NormalizeComponent(const NormalizeComponent &other): - input_dim_(other.input_dim_), target_rms_(other.target_rms_), - add_log_stddev_(other.add_log_stddev_) { } + input_dim_(other.input_dim_), target_rms_(other.target_rms_), + add_log_stddev_(other.add_log_stddev_) { } void NormalizeComponent::InitFromConfig(ConfigLine *cfl) { int32 input_dim = 0; bool add_log_stddev = false; BaseFloat target_rms = 1.0; bool ok = cfl->GetValue("dim", &input_dim) || - cfl->GetValue("input-dim", &input_dim); + cfl->GetValue("input-dim", &input_dim); cfl->GetValue("target-rms", &target_rms); cfl->GetValue("add-log-stddev", &add_log_stddev); if (!ok || cfl->HasUnusedValues() || input_dim <= 0 || target_rms <= 0.0) @@ -486,8 +486,8 @@ void NormalizeComponent::Backprop(const std::string &debug_info, CuMatrixBase *in_deriv) const { if (!in_deriv) return; const CuSubMatrix out_deriv_no_log(out_deriv, - 0, out_deriv.NumRows(), - 0, input_dim_); + 0, out_deriv.NumRows(), + 0, input_dim_); CuVector dot_products(out_deriv.NumRows()); dot_products.AddDiagMatMat(1.0, out_deriv_no_log, kNoTrans, in_value, kTrans, 0.0); @@ -497,7 +497,7 @@ void NormalizeComponent::Backprop(const std::string &debug_info, if (add_log_stddev_) { CuVector log_stddev_deriv(in_norm), // log_stddev deriv as dF/dy .* (x^T x)^-1 - out_deriv_for_stddev(out_deriv.NumRows(), kUndefined); + out_deriv_for_stddev(out_deriv.NumRows(), kUndefined); // f = log(sqrt(max(epsi, x^T x / D))) // df/dx = epsi^2 * D < x^T x ? (1/(x^T x)) * x : 0. // we don't compute this exactly below for the case wehn x^2 x is very @@ -550,9 +550,9 @@ void SigmoidComponent::Backprop(const std::string &debug_info, } void SigmoidComponent::RepairGradients( - const CuMatrixBase &out_value, - CuMatrixBase *in_deriv, - SigmoidComponent *to_update) const { + const CuMatrixBase &out_value, + CuMatrixBase *in_deriv, + SigmoidComponent *to_update) const { KALDI_ASSERT(to_update != NULL); // maximum possible derivative of SigmoidComponent is 0.25. // the default lower-threshold on the derivative, below which we @@ -577,7 +577,7 @@ void SigmoidComponent::RepairGradients( BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? default_lower_threshold : self_repair_lower_threshold_) * - count_; + count_; if (self_repair_upper_threshold_ != unset) { KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid " << "components, it does nothing."; @@ -639,8 +639,8 @@ void SigmoidComponent::StoreStats(const CuMatrixBase &out_value) { void NoOpComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const CuMatrixBase &in, + CuMatrixBase *out) const { out->CopyFromMat(in); } @@ -728,7 +728,7 @@ std::string ClipGradientComponent::Info() const { << (norm_based_clipping_ ? "true" : "false") << ", clipping-threshold=" << clipping_threshold_ << ", clipped-proportion=" - << (count_ > 0 ? static_cast(num_clipped_) / count_ : 0); + << (count_ > 0 ? 
static_cast(num_clipped_)/count_ : 0); if (self_repair_scale_ != 0.0) stream << ", self-repair-clipped-proportion-threshold=" << self_repair_clipped_proportion_threshold_ @@ -748,13 +748,13 @@ void ClipGradientComponent::Init(int32 dim, int32 num_self_repaired, int32 num_backpropped) { KALDI_ASSERT(clipping_threshold >= 0 && dim > 0 && - self_repair_clipped_proportion_threshold >= 0.0 && - self_repair_target >= 0.0 && self_repair_scale >= 0.0); + self_repair_clipped_proportion_threshold >= 0.0 && + self_repair_target >= 0.0 && self_repair_scale >= 0.0); dim_ = dim; norm_based_clipping_ = norm_based_clipping; clipping_threshold_ = clipping_threshold; self_repair_clipped_proportion_threshold_ = - self_repair_clipped_proportion_threshold; + self_repair_clipped_proportion_threshold; self_repair_target_ = self_repair_target; self_repair_scale_ = self_repair_scale; num_clipped_ = num_clipped; @@ -791,27 +791,27 @@ void ClipGradientComponent::InitFromConfig(ConfigLine *cfl) { } void ClipGradientComponent::Propagate( - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { out->CopyFromMat(in); } void ClipGradientComponent::Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, - const CuMatrixBase &out_deriv, - Component *to_update_in, // may be NULL; may be identical - // to "this" or different. - CuMatrixBase *in_deriv) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, + const CuMatrixBase &out_deriv, + Component *to_update_in, // may be NULL; may be identical + // to "this" or different. + CuMatrixBase *in_deriv) const { // the following statement will do nothing if in_deriv and out_deriv have same // memory. in_deriv->CopyFromMat(out_deriv); ClipGradientComponent *to_update = - dynamic_cast(to_update_in); + dynamic_cast(to_update_in); if (clipping_threshold_ > 0) { if (norm_based_clipping_) { @@ -820,11 +820,11 @@ void ClipGradientComponent::Backprop(const std::string &debug_info, CuVector clipping_scales(in_deriv->NumRows()); clipping_scales.AddDiagMat2(pow(clipping_threshold_, -2), *in_deriv, kNoTrans, 0.0); - // now clipping_scales contains the squared (norm of each row divided by - // clipping_threshold) + // now clipping_scales contains the squared (norm of each row divided by + // clipping_threshold) int32 num_not_scaled = clipping_scales.ApplyFloor(1.0); - // now clipping_scales contains min(1, - // squared-(norm/clipping_threshold)) + // now clipping_scales contains min(1, + // squared-(norm/clipping_threshold)) if (num_not_scaled != clipping_scales.Dim()) { clipping_scales.ApplyPow(-0.5); // now clipping_scales contains max(1, @@ -832,7 +832,7 @@ void ClipGradientComponent::Backprop(const std::string &debug_info, in_deriv->MulRowsVec(clipping_scales); if (to_update != NULL) to_update->num_clipped_ += (clipping_scales.Dim() - num_not_scaled); - } + } if (to_update != NULL) to_update->count_ += clipping_scales.Dim(); } else { @@ -858,9 +858,9 @@ void ClipGradientComponent::Backprop(const std::string &debug_info, // comparable to the magnitude of input derivative, especially when the gradient // explosion is actually happening. 
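// For reference, the norm-based clipping in Backprop above amounts to
// scaling each row of in_deriv by min(1, clipping_threshold / ||row||_2):
// the AddDiagMat2 / ApplyFloor(1.0) / ApplyPow(-0.5) sequence computes
// exactly that factor without per-element branching, leaving rows already
// within the threshold untouched; num_clipped_ / count_ is the clipped-row
// proportion that decides whether the repair below fires.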
void ClipGradientComponent::RepairGradients( - const std::string &debug_info, - const CuMatrixBase &in_value, - CuMatrixBase *in_deriv, ClipGradientComponent *to_update) const { + const std::string &debug_info, + const CuMatrixBase &in_value, + CuMatrixBase *in_deriv, ClipGradientComponent *to_update) const { KALDI_ASSERT(to_update != NULL); // we use this 'repair_probability' (hardcoded for now) to limit @@ -951,7 +951,7 @@ void ClipGradientComponent::Scale(BaseFloat scale) { void ClipGradientComponent::Add(BaseFloat alpha, const Component &other_in) { const ClipGradientComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); count_ += alpha * other->count_; num_clipped_ += alpha * other->num_clipped_; @@ -968,9 +968,9 @@ void TanhComponent::Propagate(const ComponentPrecomputedIndexes *indexes, void TanhComponent::RepairGradients( - const CuMatrixBase &out_value, - CuMatrixBase *in_deriv, - TanhComponent *to_update) const { + const CuMatrixBase &out_value, + CuMatrixBase *in_deriv, + TanhComponent *to_update) const { KALDI_ASSERT(to_update != NULL); // maximum possible derivative of SigmoidComponent is 1.0 // the default lower-threshold on the derivative, below which we @@ -995,7 +995,7 @@ void TanhComponent::RepairGradients( BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? default_lower_threshold : self_repair_lower_threshold_) * - count_; + count_; if (self_repair_upper_threshold_ != unset) { KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid " << "components, it does nothing."; @@ -1069,27 +1069,27 @@ void TanhComponent::StoreStats(const CuMatrixBase &out_value) { } void RectifiedLinearComponent::Propagate( - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { // Apply rectified linear function (x >= 0 ? 1.0 : 0.0) out->CopyFromMat(in); out->ApplyFloor(0.0); } void RectifiedLinearComponent::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &, //in_value - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - Component *to_update_in, - CuMatrixBase *in_deriv) const { + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, //in_value + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + Component *to_update_in, + CuMatrixBase *in_deriv) const { if (in_deriv != NULL) { in_deriv->Heaviside(out_value); in_deriv->MulElements(out_deriv); RectifiedLinearComponent *to_update = - dynamic_cast(to_update_in); + dynamic_cast(to_update_in); if (to_update != NULL) RepairGradients(in_deriv, to_update); } @@ -1097,11 +1097,11 @@ void RectifiedLinearComponent::Backprop( void RectifiedLinearComponent::RepairGradients( - CuMatrixBase *in_deriv, - RectifiedLinearComponent *to_update) const { + CuMatrixBase *in_deriv, + RectifiedLinearComponent *to_update) const { KALDI_ASSERT(to_update != NULL); BaseFloat default_lower_threshold = 0.05, - default_upper_threshold = 0.95; + default_upper_threshold = 0.95; // we use this 'repair_probability' (hardcoded for now) to limit // this code to running on about half of the minibatches. BaseFloat repair_probability = 0.5; @@ -1118,11 +1118,11 @@ void RectifiedLinearComponent::RepairGradients( BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? 
default_lower_threshold : self_repair_lower_threshold_) * - count_, - upper_threshold = (self_repair_upper_threshold_ == unset ? - default_upper_threshold : - self_repair_upper_threshold_) * - count_; + count_, + upper_threshold = (self_repair_upper_threshold_ == unset ? + default_upper_threshold : + self_repair_upper_threshold_) * + count_; CuMatrix storage(2, dim_ + 2, kUndefined); CuSubVector thresholds_vec(storage.RowData(0) + dim_, 2); @@ -1159,7 +1159,7 @@ void RectifiedLinearComponent::RepairGradients( void RectifiedLinearComponent::StoreStats( - const CuMatrixBase &out_value) { + const CuMatrixBase &out_value) { // only store stats about every other minibatch. if (RandInt(0, 1) == 0) return; @@ -1183,24 +1183,24 @@ void AffineComponent::Resize(int32 input_dim, int32 output_dim) { void AffineComponent::Add(BaseFloat alpha, const Component &other_in) { const AffineComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); linear_params_.AddMat(alpha, other->linear_params_); bias_params_.AddVec(alpha, other->bias_params_); } AffineComponent::AffineComponent(const AffineComponent &component): - UpdatableComponent(component), - linear_params_(component.linear_params_), - bias_params_(component.bias_params_) { } + UpdatableComponent(component), + linear_params_(component.linear_params_), + bias_params_(component.bias_params_) { } AffineComponent::AffineComponent(const CuMatrixBase &linear_params, const CuVectorBase &bias_params, BaseFloat learning_rate): - linear_params_(linear_params), - bias_params_(bias_params) { + linear_params_(linear_params), + bias_params_(bias_params) { SetUnderlyingLearningRate(learning_rate); - KALDI_ASSERT(linear_params.NumRows() == bias_params.Dim() && + KALDI_ASSERT(linear_params.NumRows() == bias_params.Dim()&& bias_params.Dim() != 0); } @@ -1247,9 +1247,9 @@ Component* AffineComponent::Copy() const { BaseFloat AffineComponent::DotProduct(const UpdatableComponent &other_in) const { const AffineComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); return TraceMatMat(linear_params_, other->linear_params_, kTrans) - + VecVec(bias_params_, other->bias_params_); + + VecVec(bias_params_, other->bias_params_); } void AffineComponent::Init(int32 input_dim, int32 output_dim, @@ -1291,7 +1291,7 @@ void AffineComponent::InitFromConfig(ConfigLine *cfl) { ok = ok && cfl->GetValue("input-dim", &input_dim); ok = ok && cfl->GetValue("output-dim", &output_dim); BaseFloat param_stddev = 1.0 / std::sqrt(input_dim), - bias_stddev = 1.0; + bias_stddev = 1.0; cfl->GetValue("param-stddev", ¶m_stddev); cfl->GetValue("bias-stddev", &bias_stddev); Init(input_dim, output_dim, @@ -1309,7 +1309,7 @@ void AffineComponent::InitFromConfig(ConfigLine *cfl) { void AffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, - CuMatrixBase *out) const { + CuMatrixBase *out) const { // No need for asserts as they'll happen within the matrix operations. 
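  // The affine map y = x * W^T + b is done in two steps: broadcast the
  // bias into every output row, then accumulate the product against
  // linear_params_ with beta = 1.0 so the bias term is kept.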
out->CopyRowsFromVec(bias_params_); // copies bias_params_ to each row @@ -1390,7 +1390,7 @@ void AffineComponent::UnVectorize(const VectorBase ¶ms) { } Component *AffineComponent::CollapseWithNext( - const AffineComponent &next_component) const { + const AffineComponent &next_component) const { AffineComponent *ans = dynamic_cast(this->Copy()); KALDI_ASSERT(ans != NULL); // Note: it's possible that "ans" is really of a derived type such @@ -1408,10 +1408,10 @@ Component *AffineComponent::CollapseWithNext( } Component *AffineComponent::CollapseWithNext( - const FixedAffineComponent &next_component) const { + const FixedAffineComponent &next_component) const { // If at least one was non-updatable, make the whole non-updatable. FixedAffineComponent *ans = - dynamic_cast(next_component.Copy()); + dynamic_cast(next_component.Copy()); KALDI_ASSERT(ans != NULL); ans->linear_params_.Resize(next_component.OutputDim(), InputDim()); ans->bias_params_ = next_component.bias_params_; @@ -1424,10 +1424,10 @@ Component *AffineComponent::CollapseWithNext( } Component *AffineComponent::CollapseWithNext( - const FixedScaleComponent &next_component) const { + const FixedScaleComponent &next_component) const { KALDI_ASSERT(this->OutputDim() == next_component.InputDim()); AffineComponent *ans = - dynamic_cast(this->Copy()); + dynamic_cast(this->Copy()); KALDI_ASSERT(ans != NULL); ans->linear_params_.MulRowsVec(next_component.scales_); ans->bias_params_.MulElements(next_component.scales_); @@ -1436,10 +1436,10 @@ Component *AffineComponent::CollapseWithNext( } Component *AffineComponent::CollapseWithPrevious( - const FixedAffineComponent &prev_component) const { + const FixedAffineComponent &prev_component) const { // If at least one was non-updatable, make the whole non-updatable. 
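  // Composing the two maps y = W_this * (W_prev * x + b_prev) + b_this
  // collapses to a single affine layer with weights W_this * W_prev and
  // bias W_this * b_prev + b_this, which is what gets written into the
  // copied component below.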
FixedAffineComponent *ans = - dynamic_cast(prev_component.Copy()); + dynamic_cast(prev_component.Copy()); KALDI_ASSERT(ans != NULL); ans->linear_params_.Resize(this->OutputDim(), prev_component.InputDim()); @@ -1453,10 +1453,10 @@ Component *AffineComponent::CollapseWithPrevious( } RepeatedAffineComponent::RepeatedAffineComponent(const RepeatedAffineComponent & component) : - UpdatableComponent(component), - linear_params_(component.linear_params_), - bias_params_(component.bias_params_), - num_repeats_(component.num_repeats_) {} + UpdatableComponent(component), + linear_params_(component.linear_params_), + bias_params_(component.bias_params_), + num_repeats_(component.num_repeats_) {} void RepeatedAffineComponent::Scale(BaseFloat scale) { @@ -1466,7 +1466,7 @@ void RepeatedAffineComponent::Scale(BaseFloat scale) { void RepeatedAffineComponent::Add(BaseFloat alpha, const Component &other_in) { const RepeatedAffineComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); linear_params_.AddMat(alpha, other->linear_params_); bias_params_.AddVec(alpha, other->bias_params_); @@ -1481,7 +1481,7 @@ void RepeatedAffineComponent::SetZero(bool treat_as_gradient) { bias_params_.SetZero(); } -void RepeatedAffineComponent::PerturbParams(BaseFloat stddev) { +void RepeatedAffineComponent::PerturbParams(BaseFloat stddev){ CuMatrix temp_linear_params(linear_params_); temp_linear_params.SetRandn(); linear_params_.AddMat(stddev, temp_linear_params); @@ -1506,9 +1506,9 @@ Component* RepeatedAffineComponent::Copy() const { BaseFloat RepeatedAffineComponent::DotProduct(const UpdatableComponent &other_in) const { const RepeatedAffineComponent *other = - dynamic_cast(&other_in); + dynamic_cast(&other_in); return TraceMatMat(linear_params_, other->linear_params_, kTrans) - + VecVec(bias_params_, other->bias_params_); + + VecVec(bias_params_, other->bias_params_); } void RepeatedAffineComponent::Init(int32 input_dim, int32 output_dim, int32 num_repeats, @@ -1541,7 +1541,7 @@ void RepeatedAffineComponent::InitFromConfig(ConfigLine *cfl) { KALDI_ASSERT(output_dim % num_repeats == 0 && "num-repeats must divide output-dim"); BaseFloat param_stddev = 1.0 / std::sqrt(input_dim / num_repeats), - bias_mean = 0.0, bias_stddev = 0.0; + bias_mean = 0.0, bias_stddev = 0.0; cfl->GetValue("param-stddev", ¶m_stddev); cfl->GetValue("bias-mean", &bias_mean); cfl->GetValue("bias-stddev", &bias_stddev); @@ -1549,7 +1549,7 @@ void RepeatedAffineComponent::InitFromConfig(ConfigLine *cfl) { num_repeats, param_stddev, bias_mean, bias_stddev); if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); + << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); } @@ -1564,14 +1564,14 @@ void RepeatedAffineComponent::Propagate(const ComponentPrecomputedIndexes *index out->NumRows() == in.NumRows()); int32 num_repeats = num_repeats_, - num_rows = in.NumRows(), - block_dim_out = linear_params_.NumRows(), - block_dim_in = linear_params_.NumCols(); + num_rows = in.NumRows(), + block_dim_out = linear_params_.NumRows(), + block_dim_in = linear_params_.NumCols(); CuSubMatrix in_reshaped(in.Data(), num_rows * num_repeats, block_dim_in, block_dim_in), - out_reshaped(out->Data(), num_rows * num_repeats, - block_dim_out, block_dim_out); + out_reshaped(out->Data(), num_rows * num_repeats, + block_dim_out, block_dim_out); out_reshaped.CopyRowsFromVec(bias_params_); @@ -1587,11 +1587,11 @@ void 
RepeatedAffineComponent::Backprop(const std::string &debug_info, Component *to_update_in, CuMatrixBase *in_deriv) const { KALDI_ASSERT(out_deriv.NumCols() == out_deriv.Stride() && - (in_value.NumCols() == 0 || in_value.NumCols() == in_value.Stride()) && + (in_value.NumCols() == 0 || in_value.NumCols() == in_value.Stride()) && (!in_deriv || in_deriv->NumCols() == in_deriv->Stride())); RepeatedAffineComponent *to_update = dynamic_cast( - to_update_in); + to_update_in); // Propagate the derivative back to the input. // add with coefficient 1.0 since property kBackpropAdds is true. @@ -1599,16 +1599,16 @@ void RepeatedAffineComponent::Backprop(const std::string &debug_info, // in_deriv, in case of infinities. if (in_deriv) { int32 num_repeats = num_repeats_, - num_rows = out_deriv.NumRows(), - block_dim_out = linear_params_.NumRows(), - block_dim_in = linear_params_.NumCols(); + num_rows = out_deriv.NumRows(), + block_dim_out = linear_params_.NumRows(), + block_dim_in = linear_params_.NumCols(); CuSubMatrix in_deriv_reshaped(in_deriv->Data(), - num_rows * num_repeats, - block_dim_in, block_dim_in), - out_deriv_reshaped(out_deriv.Data(), - num_rows * num_repeats, - block_dim_out, block_dim_out); + num_rows * num_repeats, + block_dim_in, block_dim_in), + out_deriv_reshaped(out_deriv.Data(), + num_rows * num_repeats, + block_dim_out, block_dim_out); in_deriv_reshaped.AddMatMat(1.0, out_deriv_reshaped, kNoTrans, linear_params_, kNoTrans, 1.0); } @@ -1626,17 +1626,17 @@ void RepeatedAffineComponent::Update(const CuMatrixBase &in_value, in_value.NumRows() == out_deriv.NumRows()); - int32 num_repeats = num_repeats_, + int32 num_repeats = num_repeats_, num_rows = in_value.NumRows(), block_dim_out = linear_params_.NumRows(), block_dim_in = linear_params_.NumCols(); - CuSubMatrix in_value_reshaped(in_value.Data(), - num_rows * num_repeats, - block_dim_in, block_dim_in), - out_deriv_reshaped(out_deriv.Data(), - num_rows * num_repeats, - block_dim_out, block_dim_out); + CuSubMatrix in_value_reshaped(in_value.Data(), + num_rows * num_repeats, + block_dim_in, block_dim_in), + out_deriv_reshaped(out_deriv.Data(), + num_rows * num_repeats, + block_dim_out, block_dim_out); linear_params_.AddMatMat(learning_rate_, out_deriv_reshaped, kTrans, @@ -1707,9 +1707,9 @@ void NaturalGradientRepeatedAffineComponent::SetNaturalGradientConfigs() { } NaturalGradientRepeatedAffineComponent::NaturalGradientRepeatedAffineComponent( - const NaturalGradientRepeatedAffineComponent &other): - RepeatedAffineComponent(other), - preconditioner_in_(other.preconditioner_in_) { } + const NaturalGradientRepeatedAffineComponent &other): + RepeatedAffineComponent(other), + preconditioner_in_(other.preconditioner_in_) { } // virtual Component* NaturalGradientRepeatedAffineComponent::Copy() const { @@ -1717,23 +1717,23 @@ Component* NaturalGradientRepeatedAffineComponent::Copy() const { } void NaturalGradientRepeatedAffineComponent::Update( - const CuMatrixBase &in_value, - const CuMatrixBase &out_deriv) { + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv) { KALDI_ASSERT(out_deriv.NumCols() == out_deriv.Stride() && in_value.NumCols() == in_value.Stride() && in_value.NumRows() == out_deriv.NumRows()); int32 num_repeats = num_repeats_, - num_rows = in_value.NumRows(), - block_dim_out = linear_params_.NumRows(), - block_dim_in = linear_params_.NumCols(); + num_rows = in_value.NumRows(), + block_dim_out = linear_params_.NumRows(), + block_dim_in = linear_params_.NumCols(); CuSubMatrix in_value_reshaped(in_value.Data(), - num_rows 
* num_repeats, - block_dim_in, block_dim_in), - out_deriv_reshaped(out_deriv.Data(), - num_rows * num_repeats, - block_dim_out, block_dim_out); + num_rows * num_repeats, + block_dim_in, block_dim_in), + out_deriv_reshaped(out_deriv.Data(), + num_rows * num_repeats, + block_dim_out, block_dim_out); CuVector bias_deriv(block_dim_out); bias_deriv.AddRowSumMat(1.0, out_deriv_reshaped); @@ -1741,8 +1741,8 @@ void NaturalGradientRepeatedAffineComponent::Update( CuMatrix deriv(block_dim_out, block_dim_in + 1); deriv.ColRange(0, block_dim_in).AddMatMat( - 1.0, out_deriv_reshaped, kTrans, - in_value_reshaped, kNoTrans, 1.0); + 1.0, out_deriv_reshaped, kTrans, + in_value_reshaped, kNoTrans, 1.0); deriv.CopyColFromVec(bias_deriv, block_dim_in); BaseFloat scale = 1.0; @@ -1782,13 +1782,13 @@ BlockAffineComponent::BlockAffineComponent(const RepeatedAffineComponent &rac) : num_blocks_(rac.num_repeats_) { // copy rac's linear_params_ and bias_params_ to this. int32 num_rows_in_block = rac.linear_params_.NumRows(); - for (int32 block_counter = 0; block_counter < num_blocks_; block_counter++) { + for(int32 block_counter = 0; block_counter < num_blocks_; block_counter++) { int32 row_offset = block_counter * num_rows_in_block; CuSubMatrix block = this->linear_params_.RowRange(row_offset, - num_rows_in_block); + num_rows_in_block); block.CopyFromMat(rac.linear_params_); CuSubVector block_bias = this->bias_params_.Range(row_offset, - num_rows_in_block); + num_rows_in_block); block_bias.CopyFromVec(rac.bias_params_); } } @@ -1827,14 +1827,14 @@ void BlockAffineComponent::Init(int32 input_dim, void BlockAffineComponent::InitFromConfig(ConfigLine *cfl) { int32 input_dim = -1, output_dim = -1, num_blocks = -1; - if (!cfl->GetValue("input-dim", &input_dim) || - !cfl->GetValue("output-dim", &output_dim) || - !cfl->GetValue("num-blocks", &num_blocks)) + if(!cfl->GetValue("input-dim", &input_dim) || + !cfl->GetValue("output-dim", &output_dim) || + !cfl->GetValue("num-blocks", &num_blocks)) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; InitLearningRatesFromConfig(cfl); BaseFloat param_stddev = 1.0 / std::sqrt(input_dim / num_blocks), - bias_mean = 0.0, bias_stddev = 1.0; + bias_mean = 0.0, bias_stddev = 1.0; cfl->GetValue("param-stddev", ¶m_stddev); cfl->GetValue("bias-stddev", &bias_stddev); cfl->GetValue("bias-mean", &bias_mean); @@ -1856,21 +1856,21 @@ void BlockAffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes, int32 num_rows_in_block = linear_params_.NumRows() / num_blocks_; int32 num_cols_in_block = linear_params_.NumCols(); std::vector *> in_batch, out_batch, - linear_params_batch; - for (int block_counter = 0; block_counter < num_blocks_; block_counter++) { + linear_params_batch; + for(int block_counter = 0; block_counter < num_blocks_; block_counter++) { CuSubMatrix *in_block = new CuSubMatrix(in.ColRange(block_counter * num_cols_in_block, - num_cols_in_block)); + num_cols_in_block)); in_batch.push_back(in_block); CuSubMatrix *out_block = new CuSubMatrix(out->ColRange(block_counter * num_rows_in_block, - num_rows_in_block)); + num_rows_in_block)); out_batch.push_back(out_block); CuSubMatrix *linear_params_block = new CuSubMatrix(linear_params_.RowRange(block_counter * num_rows_in_block, - num_rows_in_block)); + num_rows_in_block)); linear_params_batch.push_back(linear_params_block); } AddMatMatBatched(1.0, out_batch, in_batch, kNoTrans, @@ -1900,20 +1900,20 @@ void BlockAffineComponent::Backprop(const std::string &debug_info, if 
@@ -1900,20 +1900,20 @@ void BlockAffineComponent::Backprop(const std::string &debug_info,
   if (in_deriv) {
     std::vector<CuSubMatrix<BaseFloat> *> in_deriv_batch, out_deriv_batch, linear_params_batch;

-    for (int block_counter = 0; block_counter < num_blocks_; block_counter++) {
+    for(int block_counter = 0; block_counter < num_blocks_; block_counter++) {
       CuSubMatrix<BaseFloat> *in_deriv_block =
           new CuSubMatrix<BaseFloat>(in_deriv->ColRange(block_counter * num_cols_in_block,
-                                                        num_cols_in_block));
+          num_cols_in_block));
       in_deriv_batch.push_back(in_deriv_block);

       CuSubMatrix<BaseFloat> *out_deriv_block =
           new CuSubMatrix<BaseFloat>(out_deriv.ColRange(block_counter * num_rows_in_block,
-                                                        num_rows_in_block));
+          num_rows_in_block));
       out_deriv_batch.push_back(out_deriv_block);

       CuSubMatrix<BaseFloat> *linear_params_block =
           new CuSubMatrix<BaseFloat>(linear_params_.RowRange(block_counter * num_rows_in_block,
-                                                             num_rows_in_block));
+          num_rows_in_block));
       linear_params_batch.push_back(linear_params_block);
     }

@@ -1927,26 +1927,25 @@ void BlockAffineComponent::Backprop(const std::string &debug_info,

   if (to_update != NULL) {
-    {
-      // linear params update
+    { // linear params update
       std::vector<CuSubMatrix<BaseFloat> *> in_value_batch,
-          out_deriv_batch, linear_params_batch;
+        out_deriv_batch, linear_params_batch;

       for (int block_counter = 0; block_counter < num_blocks_; block_counter++) {
         CuSubMatrix<BaseFloat> *in_value_block =
             new CuSubMatrix<BaseFloat>(in_value.ColRange(block_counter * num_cols_in_block,
-                                                         num_cols_in_block));
+            num_cols_in_block));
         in_value_batch.push_back(in_value_block);

         CuSubMatrix<BaseFloat> *out_deriv_block =
             new CuSubMatrix<BaseFloat>(out_deriv.ColRange(block_counter * num_rows_in_block,
-                                                          num_rows_in_block));
+            num_rows_in_block));
         out_deriv_batch.push_back(out_deriv_block);

         CuSubMatrix<BaseFloat> *linear_params_block =
             new CuSubMatrix<BaseFloat>(to_update->linear_params_.RowRange(block_counter * num_rows_in_block,
-                                                                          num_rows_in_block));
+            num_rows_in_block));
         linear_params_batch.push_back(linear_params_block);
       }

@@ -1960,8 +1959,7 @@ void BlockAffineComponent::Backprop(const std::string &debug_info,
       DeletePointers(&linear_params_batch);
     } // end linear params update

-    {
-      // bias update
+    { // bias update
       to_update->bias_params_.AddRowSumMat(to_update->learning_rate_,
                                            out_deriv, 1.0);
     } // end bias update
@@ -2004,7 +2002,7 @@ BaseFloat BlockAffineComponent::DotProduct(const UpdatableComponent &other_in) const {
   const BlockAffineComponent *other =
       dynamic_cast<const BlockAffineComponent*>(&other_in);
   return TraceMatMat(linear_params_, other->linear_params_, kTrans) +
-      VecVec(bias_params_, other->bias_params_);
+         VecVec(bias_params_, other->bias_params_);
 }

 void BlockAffineComponent::Read(std::istream &is, bool binary) {
@@ -2060,15 +2058,15 @@ void PerElementScaleComponent::Scale(BaseFloat scale) {

 void PerElementScaleComponent::Add(BaseFloat alpha,
                                    const Component &other_in) {
   const PerElementScaleComponent *other =
-      dynamic_cast<const PerElementScaleComponent*>(&other_in);
+    dynamic_cast<const PerElementScaleComponent*>(&other_in);
   KALDI_ASSERT(other != NULL);
   scales_.AddVec(alpha, other->scales_);
 }

 PerElementScaleComponent::PerElementScaleComponent(
-    const PerElementScaleComponent &component):
-    UpdatableComponent(component),
-    scales_(component.scales_) { }
+  const PerElementScaleComponent &component):
+  UpdatableComponent(component),
+  scales_(component.scales_) { }

 void PerElementScaleComponent::SetZero(bool treat_as_gradient) {
   if (treat_as_gradient) {
@@ -2098,9 +2096,9 @@ Component* PerElementScaleComponent::Copy() const {
 }

 BaseFloat PerElementScaleComponent::DotProduct(
-    const UpdatableComponent &other_in) const {
+  const UpdatableComponent &other_in) const {
   const PerElementScaleComponent *other =
-      dynamic_cast<const PerElementScaleComponent*>(&other_in);
+    dynamic_cast<const PerElementScaleComponent*>(&other_in);
   return VecVec(scales_, other->scales_);
 }
@@ -2131,7 +2129,7 @@ void PerElementScaleComponent::InitFromConfig(ConfigLine *cfl) {
     KALDI_ASSERT(dim == InputDim() &&
                  "input-dim mismatch vs. vector.");
   } else {
-    if (!cfl->GetValue("dim", &dim))
+    if(!cfl->GetValue("dim", &dim))
       KALDI_ERR << "'dim' not provided in the config line.";
     BaseFloat param_mean = 1.0, param_stddev = 0.0;
     cfl->GetValue("param-mean", &param_mean);
@@ -2144,30 +2142,30 @@ void PerElementScaleComponent::InitFromConfig(ConfigLine *cfl) {
 }

 void PerElementScaleComponent::Propagate(
-    const ComponentPrecomputedIndexes *indexes,
-    const CuMatrixBase<BaseFloat> &in,
-    CuMatrixBase<BaseFloat> *out) const {
+  const ComponentPrecomputedIndexes *indexes,
+  const CuMatrixBase<BaseFloat> &in,
+  CuMatrixBase<BaseFloat> *out) const {
   out->CopyFromMat(in);
   out->MulColsVec(scales_);
 }

 void PerElementScaleComponent::UpdateSimple(
-    const CuMatrixBase<BaseFloat> &in_value,
-    const CuMatrixBase<BaseFloat> &out_deriv) {
+  const CuMatrixBase<BaseFloat> &in_value,
+  const CuMatrixBase<BaseFloat> &out_deriv) {
   scales_.AddDiagMatMat(learning_rate_, out_deriv, kTrans,
                         in_value, kNoTrans, 1.0);
 }
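UpdateSimple above uses AddDiagMatMat, which forms only the diagonal of out_deriv^T * in_value: exactly the per-element gradient of a scale vector, since output(t, i) = scales_[i] * input(t, i). A CPU reference of the same quantity, for clarity (a sketch assuming Matrix<BaseFloat> copies of in_value and out_deriv; not the patch's code):

    // grad(i) = sum_t out_deriv(t, i) * in_value(t, i); the GPU call adds
    // learning_rate_ * grad to scales_ without materializing the full product.
    Vector<BaseFloat> grad(in_value.NumCols());
    for (int32 i = 0; i < in_value.NumCols(); i++) {
      BaseFloat sum = 0.0;
      for (int32 t = 0; t < in_value.NumRows(); t++)
        sum += out_deriv(t, i) * in_value(t, i);
      grad(i) = sum;
    }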
vector."); } else { - if (!cfl->GetValue("dim", &dim)) + if(!cfl->GetValue("dim", &dim)) KALDI_ERR << "'dim' not provided in the config line."; BaseFloat param_mean = 0.0, param_stddev = 0.0; cfl->GetValue("param-mean", ¶m_mean); @@ -2308,23 +2306,23 @@ void PerElementOffsetComponent::InitFromConfig(ConfigLine *cfl) { } void PerElementOffsetComponent::Propagate( - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { out->CopyFromMat(in); out->AddVecToRows(1.0, offsets_); } void PerElementOffsetComponent::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &, // in_value - const CuMatrixBase &, // out_value - const CuMatrixBase &out_deriv, - Component *to_update_in, - CuMatrixBase *in_deriv) const { + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + Component *to_update_in, + CuMatrixBase *in_deriv) const { PerElementOffsetComponent *to_update = - dynamic_cast(to_update_in); + dynamic_cast(to_update_in); if (in_deriv) { // Propagate the derivative back to the input. @@ -2362,7 +2360,7 @@ void PerElementOffsetComponent::Vectorize(VectorBase *params) const { } void PerElementOffsetComponent::UnVectorize( - const VectorBase ¶ms) { + const VectorBase ¶ms) { offsets_.CopyFromVec(params); } @@ -2379,30 +2377,30 @@ std::string ConstantFunctionComponent::Info() const { } ConstantFunctionComponent::ConstantFunctionComponent(): - input_dim_(-1), is_updatable_(true), use_natural_gradient_(true) { } + input_dim_(-1), is_updatable_(true), use_natural_gradient_(true) { } ConstantFunctionComponent::ConstantFunctionComponent( - const ConstantFunctionComponent &other): - input_dim_(other.input_dim_), output_(other.output_), - is_updatable_(other.is_updatable_), - use_natural_gradient_(other.use_natural_gradient_), - preconditioner_(other.preconditioner_) { } + const ConstantFunctionComponent &other): + input_dim_(other.input_dim_), output_(other.output_), + is_updatable_(other.is_updatable_), + use_natural_gradient_(other.use_natural_gradient_), + preconditioner_(other.preconditioner_) { } void ConstantFunctionComponent::Propagate( - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { out->CopyRowsFromVec(output_); } void ConstantFunctionComponent::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &, // in_value - const CuMatrixBase &, // out_value - const CuMatrixBase &out_deriv, - Component *to_update_in, - CuMatrixBase *in_deriv) const { + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + Component *to_update_in, + CuMatrixBase *in_deriv) const { // we don't update in_deriv, since we set the flag // kBackpropAdds, and the output doesn't depend on the // input, so the input-derivative is zero. 
@@ -2416,7 +2414,7 @@ void ConstantFunctionComponent::Backprop(
       CuMatrix<BaseFloat> out_deriv_copy(out_deriv);
       BaseFloat scale = 1.0;
       to_update->preconditioner_.PreconditionDirections(&out_deriv_copy,
-                                                        NULL, &scale);
+          NULL, &scale);
       to_update->output_.AddRowSumMat(scale * to_update->learning_rate_,
                                       out_deriv_copy);
     } else {
@@ -2491,7 +2489,7 @@ void ConstantFunctionComponent::Scale(BaseFloat scale) {
 void ConstantFunctionComponent::Add(BaseFloat alpha, const Component &other_in) {
   if (is_updatable_) {
     const ConstantFunctionComponent *other =
-        dynamic_cast<const ConstantFunctionComponent*>(&other_in);
+      dynamic_cast<const ConstantFunctionComponent*>(&other_in);
     KALDI_ASSERT(other != NULL);
     output_.AddVec(alpha, other->output_);
   }
@@ -2512,10 +2510,10 @@ void ConstantFunctionComponent::PerturbParams(BaseFloat stddev) {
 }

 BaseFloat ConstantFunctionComponent::DotProduct(
-    const UpdatableComponent &other_in) const {
+  const UpdatableComponent &other_in) const {
   KALDI_ASSERT(is_updatable_);
   const ConstantFunctionComponent *other =
-      dynamic_cast<const ConstantFunctionComponent*>(&other_in);
+    dynamic_cast<const ConstantFunctionComponent*>(&other_in);
   KALDI_ASSERT(other != NULL);
   return VecVec(output_, other->output_);
 }
@@ -2524,7 +2522,7 @@ void ConstantFunctionComponent::InitFromConfig(ConfigLine *cfl) {
   int32 output_dim = 0;
   InitLearningRatesFromConfig(cfl);
   bool ok = cfl->GetValue("output-dim", &output_dim) &&
-      cfl->GetValue("input-dim", &input_dim_);
+            cfl->GetValue("input-dim", &input_dim_);
   cfl->GetValue("is-updatable", &is_updatable_);
   cfl->GetValue("use-natural-gradient", &use_natural_gradient_);
   BaseFloat output_mean = 0.0, output_stddev = 0.0;
@@ -2556,13 +2554,13 @@ void ConstantFunctionComponent::UnVectorize(const VectorBase<BaseFloat> &params) {

 NaturalGradientAffineComponent::NaturalGradientAffineComponent():
-    max_change_per_sample_(0.0),
-    update_count_(0.0), active_scaling_count_(0.0),
-    max_change_scale_stats_(0.0) { }
+  max_change_per_sample_(0.0),
+  update_count_(0.0), active_scaling_count_(0.0),
+  max_change_scale_stats_(0.0) { }

 // virtual
 void NaturalGradientAffineComponent::Resize(
-    int32 input_dim, int32 output_dim) {
+  int32 input_dim, int32 output_dim) {
   KALDI_ASSERT(input_dim > 1 && output_dim > 1);
   if (rank_in_ >= input_dim) rank_in_ = input_dim - 1;
   if (rank_out_ >= output_dim) rank_out_ = output_dim - 1;
@@ -2616,9 +2614,9 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) {
   bool ok = true;
   std::string matrix_filename;
   BaseFloat num_samples_history = 2000.0, alpha = 4.0,
-      max_change_per_sample = 0.0;
+            max_change_per_sample = 0.0;
   int32 input_dim = -1, output_dim = -1, rank_in = 20, rank_out = 80,
-      update_period = 4;
+        update_period = 4;
   InitLearningRatesFromConfig(cfl);
   cfl->GetValue("num-samples-history", &num_samples_history);
   cfl->GetValue("alpha", &alpha);
@@ -2643,7 +2641,7 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) {
   if (!ok)
     KALDI_ERR << "Bad initializer " << cfl->WholeLine();
   BaseFloat param_stddev = 1.0 / std::sqrt(input_dim),
-      bias_stddev = 1.0, bias_mean = 0.0;
+            bias_stddev = 1.0, bias_mean = 0.0;
   cfl->GetValue("param-stddev", &param_stddev);
   cfl->GetValue("bias-stddev", &bias_stddev);
   cfl->GetValue("bias-mean", &bias_mean);
@@ -2670,10 +2668,10 @@ void NaturalGradientAffineComponent::SetNaturalGradientConfigs() {
 }
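For reference, InitFromConfig above accepts nnet3 config lines of roughly this shape (values are illustrative, and any key not visible in this hunk is an assumption):

    component name=affine1 type=NaturalGradientAffineComponent \
      input-dim=512 output-dim=512 param-stddev=0.044 bias-stddev=1.0 \
      num-samples-history=2000.0 alpha=4.0

Unset keys fall back to the defaults declared at the top of the function (rank_in = 20, rank_out = 80, update_period = 4, num_samples_history = 2000.0, alpha = 4.0).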
 void NaturalGradientAffineComponent::Init(
-    int32 rank_in, int32 rank_out,
-    int32 update_period, BaseFloat num_samples_history, BaseFloat alpha,
-    BaseFloat max_change_per_sample,
-    std::string matrix_filename) {
+  int32 rank_in, int32 rank_out,
+  int32 update_period, BaseFloat num_samples_history, BaseFloat alpha,
+  BaseFloat max_change_per_sample,
+  std::string matrix_filename) {
   rank_in_ = rank_in;
   rank_out_ = rank_out;
   update_period_ = update_period;
@@ -2697,11 +2695,11 @@ void NaturalGradientAffineComponent::Init(
 }

 void NaturalGradientAffineComponent::Init(
-    int32 input_dim, int32 output_dim,
-    BaseFloat param_stddev, BaseFloat bias_stddev, BaseFloat bias_mean,
-    int32 rank_in, int32 rank_out, int32 update_period,
-    BaseFloat num_samples_history, BaseFloat alpha,
-    BaseFloat max_change_per_sample) {
+  int32 input_dim, int32 output_dim,
+  BaseFloat param_stddev, BaseFloat bias_stddev, BaseFloat bias_mean,
+  int32 rank_in, int32 rank_out, int32 update_period,
+  BaseFloat num_samples_history, BaseFloat alpha,
+  BaseFloat max_change_per_sample) {
   linear_params_.Resize(output_dim, input_dim);
   bias_params_.Resize(output_dim);
   KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0 &&
@@ -2731,7 +2729,7 @@ void NaturalGradientAffineComponent::Init(
 }

 void NaturalGradientAffineComponent::Write(std::ostream &os,
-                                           bool binary) const {
+    bool binary) const {
   WriteUpdatableCommon(os, binary);  // Write the opening tag and learning rate
   WriteToken(os, binary, "<LinearParams>");
   linear_params_.Write(os, binary);
@@ -2784,26 +2782,26 @@ Component* NaturalGradientAffineComponent::Copy() const {
 }

 NaturalGradientAffineComponent::NaturalGradientAffineComponent(
-    const NaturalGradientAffineComponent &other):
-    AffineComponent(other),
-    rank_in_(other.rank_in_),
-    rank_out_(other.rank_out_),
-    update_period_(other.update_period_),
-    num_samples_history_(other.num_samples_history_),
-    alpha_(other.alpha_),
-    preconditioner_in_(other.preconditioner_in_),
-    preconditioner_out_(other.preconditioner_out_),
-    max_change_per_sample_(other.max_change_per_sample_),
-    update_count_(other.update_count_),
-    active_scaling_count_(other.active_scaling_count_),
-    max_change_scale_stats_(other.max_change_scale_stats_) {
+  const NaturalGradientAffineComponent &other):
+  AffineComponent(other),
+  rank_in_(other.rank_in_),
+  rank_out_(other.rank_out_),
+  update_period_(other.update_period_),
+  num_samples_history_(other.num_samples_history_),
+  alpha_(other.alpha_),
+  preconditioner_in_(other.preconditioner_in_),
+  preconditioner_out_(other.preconditioner_out_),
+  max_change_per_sample_(other.max_change_per_sample_),
+  update_count_(other.update_count_),
+  active_scaling_count_(other.active_scaling_count_),
+  max_change_scale_stats_(other.max_change_scale_stats_) {
   SetNaturalGradientConfigs();
 }

 void NaturalGradientAffineComponent::Update(
-    const std::string &debug_info,
-    const CuMatrixBase<BaseFloat> &in_value,
-    const CuMatrixBase<BaseFloat> &out_deriv) {
+  const std::string &debug_info,
+  const CuMatrixBase<BaseFloat> &in_value,
+  const CuMatrixBase<BaseFloat> &out_deriv) {
   CuMatrix<BaseFloat> in_value_temp;

   in_value_temp.Resize(in_value.NumRows(),
@@ -2820,16 +2818,16 @@ void NaturalGradientAffineComponent::Update(

   CuMatrix<BaseFloat> row_products(2, in_value.NumRows());
   CuSubVector<BaseFloat> in_row_products(row_products, 0),
-      out_row_products(row_products, 1);
+                         out_row_products(row_products, 1);

   // These "scale" values will get multiplied into the learning rate (faster
   // than having the matrices scaled inside the preconditioning code).
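  // [Sketch of the math, not part of the patch's code] in_value_temp above has
  // one extra column pinned at 1.0, folding the bias into the same update as
  // the weights: after preconditioning, columns 0..D-1 act as the input and
  // the last column (precon_ones below) acts as the bias direction, so that,
  // up to the scale factors the preconditioners return,
  //   delta(linear_params_) ~ lr * out_deriv^T * in_value_precon_part
  //   delta(bias_params_)   ~ lr * out_deriv^T * precon_ones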
   BaseFloat in_scale, out_scale;
   preconditioner_in_.PreconditionDirections(&in_value_temp, &in_row_products,
-                                            &in_scale);
+      &in_scale);
   preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_row_products,
-                                             &out_scale);
+      &out_scale);

   // "scale" is a scaling factor coming from the PreconditionDirections calls
   // (it's faster to have them output a scaling factor than to have them scale
@@ -2837,8 +2835,8 @@ void NaturalGradientAffineComponent::Update(
   BaseFloat scale = in_scale * out_scale;
   CuSubMatrix<BaseFloat> in_value_precon_part(in_value_temp,
-                                              0, in_value_temp.NumRows(),
-                                              0, in_value_temp.NumCols() - 1);
+      0, in_value_temp.NumRows(),
+      0, in_value_temp.NumCols() - 1);
   // this "precon_ones" is what happens to the vector of 1's representing
   // offsets, after multiplication by the preconditioner.
   CuVector<BaseFloat> precon_ones(in_value_temp.NumRows());
@@ -2869,7 +2867,7 @@ void NaturalGradientAffineComponent::Scale(BaseFloat scale) {
 void NaturalGradientAffineComponent::Add(BaseFloat alpha, const Component &other_in) {
   const NaturalGradientAffineComponent *other =
-      dynamic_cast<const NaturalGradientAffineComponent*>(&other_in);
+    dynamic_cast<const NaturalGradientAffineComponent*>(&other_in);
   KALDI_ASSERT(other != NULL);
   update_count_ += alpha * other->update_count_;
   max_change_scale_stats_ += alpha * other->max_change_scale_stats_;
@@ -2923,8 +2921,8 @@ void FixedAffineComponent::InitFromConfig(ConfigLine *cfl) {

 FixedAffineComponent::FixedAffineComponent(const AffineComponent &c):
-    linear_params_(c.LinearParams()),
-    bias_params_(c.BiasParams()) { }
+  linear_params_(c.LinearParams()),
+  bias_params_(c.BiasParams()) { }

 void FixedAffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
                                      const CuMatrixBase<BaseFloat> &in,
@@ -3060,7 +3058,7 @@ void SumGroupComponent::GetSizes(std::vector<int32> *sizes) const {
   for (size_t i = 0; i < indexes.size(); i++) {
     (*sizes)[i] = indexes[i].second - indexes[i].first;
     if (i == 0) { KALDI_ASSERT(indexes[i].first == 0); }
-    else { KALDI_ASSERT(indexes[i].first == indexes[i - 1].second); }
+    else { KALDI_ASSERT(indexes[i].first == indexes[i-1].second); }
     KALDI_ASSERT(indexes[i].second > indexes[i].first);
     (*sizes)[i] = indexes[i].second - indexes[i].first;
   }
@@ -3303,7 +3301,7 @@ void FixedBiasComponent::Read(std::istream &is, bool binary) {

 void NaturalGradientPerElementScaleComponent::Read(
-    std::istream &is, bool binary) {
+  std::istream &is, bool binary) {
   ReadUpdatableCommon(is, binary);  // Read the opening tag and learning rate
   ExpectToken(is, binary, "<Params>");
   scales_.Read(is, binary);
@@ -3329,7 +3327,7 @@ void NaturalGradientPerElementScaleComponent::Read(
 }

 void NaturalGradientPerElementScaleComponent::Write(std::ostream &os,
-                                                    bool binary) const {
+    bool binary) const {
   WriteUpdatableCommon(os, binary);  // Write the opening tag and learning rate
   WriteToken(os, binary, "<Params>");
   scales_.Write(os, binary);
@@ -3362,15 +3360,15 @@ std::string NaturalGradientPerElementScaleComponent::Info() const {
 void NaturalGradientPerElementScaleComponent::InitFromConfig(ConfigLine *cfl) {
   // First set various configuration values that have defaults.
   int32 rank = 8,  // Use a small rank because in this case the amount of memory
-                   // for the preconditioner actually exceeds the memory for the
-                   // parameters (by "rank").
-      update_period = 10;
+        // for the preconditioner actually exceeds the memory for the
+        // parameters (by "rank").
+        update_period = 10;
   // the max_change_per_minibatch is the maximum amount of parameter-change, in 2-norm,
   // that we allow per minibatch; if change is greater than that, we scale down
   // the parameter-change.
  // It has the same purpose as the max-change-per-sample in
  // the NaturalGradientAffineComponent.
   BaseFloat num_samples_history = 2000.0, alpha = 4.0,
-      max_change_per_minibatch = 0.0;
+            max_change_per_minibatch = 0.0;
   cfl->GetValue("rank", &rank);
   cfl->GetValue("update-period", &update_period);
   cfl->GetValue("num-samples-history", &num_samples_history);
@@ -3402,10 +3400,10 @@ void NaturalGradientPerElementScaleComponent::InitFromConfig(ConfigLine *cfl) {
 }

 void NaturalGradientPerElementScaleComponent::Init(
-    int32 dim, BaseFloat param_mean,
-    BaseFloat param_stddev, int32 rank, int32 update_period,
-    BaseFloat num_samples_history, BaseFloat alpha,
-    BaseFloat max_change_per_minibatch) {
+  int32 dim, BaseFloat param_mean,
+  BaseFloat param_stddev, int32 rank, int32 update_period,
+  BaseFloat num_samples_history, BaseFloat alpha,
+  BaseFloat max_change_per_minibatch) {
   PerElementScaleComponent::Init(dim, param_mean,
                                  param_stddev);
   preconditioner_.SetRank(rank);
@@ -3421,9 +3419,9 @@ void NaturalGradientPerElementScaleComponent::Init(
 }

 void NaturalGradientPerElementScaleComponent::Init(
-    std::string vector_filename,
-    int32 rank, int32 update_period, BaseFloat num_samples_history,
-    BaseFloat alpha, BaseFloat max_change_per_minibatch) {
+  std::string vector_filename,
+  int32 rank, int32 update_period, BaseFloat num_samples_history,
+  BaseFloat alpha, BaseFloat max_change_per_minibatch) {
   PerElementScaleComponent::Init(vector_filename);
   preconditioner_.SetRank(rank);
   preconditioner_.SetUpdatePeriod(update_period);
@@ -3434,10 +3432,10 @@ void NaturalGradientPerElementScaleComponent::Init(

 NaturalGradientPerElementScaleComponent::NaturalGradientPerElementScaleComponent(
-    const NaturalGradientPerElementScaleComponent &other):
-    PerElementScaleComponent(other),
-    max_change_per_minibatch_(other.max_change_per_minibatch_),
-    preconditioner_(other.preconditioner_) { }
+  const NaturalGradientPerElementScaleComponent &other):
+  PerElementScaleComponent(other),
+  max_change_per_minibatch_(other.max_change_per_minibatch_),
+  preconditioner_(other.preconditioner_) { }

@@ -3447,9 +3445,9 @@ Component* NaturalGradientPerElementScaleComponent::Copy() const {
 }

 void NaturalGradientPerElementScaleComponent::Update(
-    const std::string &debug_info,
-    const CuMatrixBase<BaseFloat> &in_value,
-    const CuMatrixBase<BaseFloat> &out_deriv) {
+  const std::string &debug_info,
+  const CuMatrixBase<BaseFloat> &in_value,
+  const CuMatrixBase<BaseFloat> &out_deriv) {
   CuMatrix<BaseFloat> derivs_per_frame(in_value);
   derivs_per_frame.MulElements(out_deriv);

@@ -3466,46 +3464,46 @@ void NaturalGradientPerElementScaleComponent::Update(

 // Constructors for the convolution component
 ConvolutionComponent::ConvolutionComponent():
-    UpdatableComponent(),
-    input_x_dim_(0), input_y_dim_(0), input_z_dim_(0),
-    filt_x_dim_(0), filt_y_dim_(0),
-    filt_x_step_(0), filt_y_step_(0),
-    input_vectorization_(kZyx),
-    is_gradient_(false) {}
+  UpdatableComponent(),
+  input_x_dim_(0), input_y_dim_(0), input_z_dim_(0),
+  filt_x_dim_(0), filt_y_dim_(0),
+  filt_x_step_(0), filt_y_step_(0),
+  input_vectorization_(kZyx),
+  is_gradient_(false) {}

 ConvolutionComponent::ConvolutionComponent(
-    const ConvolutionComponent &component):
-    UpdatableComponent(component),
-    input_x_dim_(component.input_x_dim_),
-    input_y_dim_(component.input_y_dim_),
-    input_z_dim_(component.input_z_dim_),
-    filt_x_dim_(component.filt_x_dim_),
-    filt_y_dim_(component.filt_y_dim_),
-    filt_x_step_(component.filt_x_step_),
-    filt_y_step_(component.filt_y_step_),
-    input_vectorization_(component.input_vectorization_),
-    filter_params_(component.filter_params_),
-    bias_params_(component.bias_params_),
-    is_gradient_(component.is_gradient_) {}
+  const ConvolutionComponent &component):
+  UpdatableComponent(component),
+  input_x_dim_(component.input_x_dim_),
+  input_y_dim_(component.input_y_dim_),
+  input_z_dim_(component.input_z_dim_),
+  filt_x_dim_(component.filt_x_dim_),
+  filt_y_dim_(component.filt_y_dim_),
+  filt_x_step_(component.filt_x_step_),
+  filt_y_step_(component.filt_y_step_),
+  input_vectorization_(component.input_vectorization_),
+  filter_params_(component.filter_params_),
+  bias_params_(component.bias_params_),
+  is_gradient_(component.is_gradient_) {}

 ConvolutionComponent::ConvolutionComponent(
-    const CuMatrixBase<BaseFloat> &filter_params,
-    const CuVectorBase<BaseFloat> &bias_params,
-    int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
-    int32 filt_x_dim, int32 filt_y_dim,
-    int32 filt_x_step, int32 filt_y_step,
-    TensorVectorizationType input_vectorization,
-    BaseFloat learning_rate):
-    input_x_dim_(input_x_dim),
-    input_y_dim_(input_y_dim),
-    input_z_dim_(input_z_dim),
-    filt_x_dim_(filt_x_dim),
-    filt_y_dim_(filt_y_dim),
-    filt_x_step_(filt_x_step),
-    filt_y_step_(filt_y_step),
-    input_vectorization_(input_vectorization),
-    filter_params_(filter_params),
-    bias_params_(bias_params) {
+  const CuMatrixBase<BaseFloat> &filter_params,
+  const CuVectorBase<BaseFloat> &bias_params,
+  int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
+  int32 filt_x_dim, int32 filt_y_dim,
+  int32 filt_x_step, int32 filt_y_step,
+  TensorVectorizationType input_vectorization,
+  BaseFloat learning_rate):
+  input_x_dim_(input_x_dim),
+  input_y_dim_(input_y_dim),
+  input_z_dim_(input_z_dim),
+  filt_x_dim_(filt_x_dim),
+  filt_y_dim_(filt_y_dim),
+  filt_x_step_(filt_x_step),
+  filt_y_step_(filt_y_step),
+  input_vectorization_(input_vectorization),
+  filter_params_(filter_params),
+  bias_params_(bias_params){
   KALDI_ASSERT(filter_params.NumRows() == bias_params.Dim() &&
                bias_params.Dim() != 0);
   KALDI_ASSERT(filter_params.NumCols() == filt_x_dim * filt_y_dim * input_z_dim);
@@ -3528,11 +3526,11 @@ int32 ConvolutionComponent::OutputDim() const {

 // initialize the component using hyperparameters
 void ConvolutionComponent::Init(
-    int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
-    int32 filt_x_dim, int32 filt_y_dim,
-    int32 filt_x_step, int32 filt_y_step, int32 num_filters,
-    TensorVectorizationType input_vectorization,
-    BaseFloat param_stddev, BaseFloat bias_stddev) {
+  int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
+  int32 filt_x_dim, int32 filt_y_dim,
+  int32 filt_x_step, int32 filt_y_step, int32 num_filters,
+  TensorVectorizationType input_vectorization,
+  BaseFloat param_stddev, BaseFloat bias_stddev) {
   input_x_dim_ = input_x_dim;
   input_y_dim_ = input_y_dim;
   input_z_dim_ = input_z_dim;
@@ -3555,11 +3553,11 @@ void ConvolutionComponent::Init(

 // initialize the component using predefined matrix file
 void ConvolutionComponent::Init(
-    int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
-    int32 filt_x_dim, int32 filt_y_dim,
-    int32 filt_x_step, int32 filt_y_step,
-    TensorVectorizationType input_vectorization,
-    std::string matrix_filename) {
+  int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
+  int32 filt_x_dim, int32 filt_y_dim,
+  int32 filt_x_step, int32 filt_y_step,
+  TensorVectorizationType input_vectorization,
+  std::string matrix_filename) {
   input_x_dim_ = input_x_dim;
   input_y_dim_ = input_y_dim;
   input_z_dim_ = input_z_dim;
@@ -3652,7 +3650,7 @@ void ConvolutionComponent::InitFromConfig(ConfigLine *cfl) {
   }
   if (cfl->HasUnusedValues())
     KALDI_ERR << "Could not process these elements in initializer: "
-              << cfl->UnusedValues();
+	          << cfl->UnusedValues();
   if (!ok)
     KALDI_ERR << "Bad initializer " << cfl->WholeLine();
 }
@@ -3679,8 +3677,8 @@ inline int32 ZyxVectorIndex(int32 x, int32 y, int32 z,
 // 3D tensors to patches for convolution, each patch corresponds to
 // one dot product in the convolution
 void ConvolutionComponent::InputToInputPatches(
-    const CuMatrixBase<BaseFloat>& in,
-    CuMatrix<BaseFloat> *patches) const {
+  const CuMatrixBase<BaseFloat>& in,
+  CuMatrix<BaseFloat> *patches) const{
   int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_);
   int32 num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_);
   const int32 filt_x_step = filt_x_step_,
@@ -3709,9 +3707,9 @@ void ConvolutionComponent::InputToInputPatches(
                                          input_z_dim);
         } else if (input_vectorization_ == kYzx) {
           column_map[index] = YzxVectorIndex(x_step * filt_x_step + x,
-                                             y_step * filt_y_step + y, z,
-                                             input_x_dim, input_y_dim,
-                                             input_z_dim);
+              y_step * filt_y_step + y, z,
+              input_x_dim, input_y_dim,
+              input_z_dim);
         }
       }
     }
@@ -3726,8 +3724,8 @@ void ConvolutionComponent::InputToInputPatches(
 // propagation function
 // see function declaration in nnet-simple-component.h for details
 void ConvolutionComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
-                                     const CuMatrixBase<BaseFloat> &in,
-                                     CuMatrixBase<BaseFloat> *out) const {
+    const CuMatrixBase<BaseFloat> &in,
+    CuMatrixBase<BaseFloat> *out) const {
   const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_),
               num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_),
               num_filters = filter_params_.NumRows(),
@@ -3741,8 +3739,8 @@ void ConvolutionComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
                                   kUndefined);
   InputToInputPatches(in, &patches);
   CuSubMatrix<BaseFloat>* filter_params_elem = new CuSubMatrix<BaseFloat>(
-      filter_params_, 0, filter_params_.NumRows(), 0,
-      filter_params_.NumCols());
+    filter_params_, 0, filter_params_.NumRows(), 0,
+    filter_params_.NumCols());
   std::vector<CuSubMatrix<BaseFloat>* > tgt_batch, patch_batch,
       filter_params_batch;

@@ -3750,9 +3748,9 @@ void ConvolutionComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
     for (int32 y_step = 0; y_step < num_y_steps; y_step++) {
       int32 patch_number = x_step * num_y_steps + y_step;
       tgt_batch.push_back(new CuSubMatrix<BaseFloat>(
-          out->ColRange(patch_number * num_filters, num_filters)));
+        out->ColRange(patch_number * num_filters, num_filters)));
       patch_batch.push_back(new CuSubMatrix<BaseFloat>(
-          patches.ColRange(patch_number * filter_dim, filter_dim)));
+        patches.ColRange(patch_number * filter_dim, filter_dim)));
       filter_params_batch.push_back(filter_params_elem);
       tgt_batch[patch_number]->AddVecToRows(1.0, bias_params_, 1.0); // add bias
     }
@@ -3778,7 +3776,7 @@ void ConvolutionComponent::Scale(BaseFloat scale) {
 // add another convolution component
 void ConvolutionComponent::Add(BaseFloat alpha, const Component &other_in) {
   const ConvolutionComponent *other =
-      dynamic_cast<const ConvolutionComponent*>(&other_in);
+    dynamic_cast<const ConvolutionComponent*>(&other_in);
   KALDI_ASSERT(other != NULL);
   filter_params_.AddMat(alpha, other->filter_params_);
   bias_params_.AddVec(alpha, other->bias_params_);
@@ -3796,7 +3794,7 @@ void ConvolutionComponent::Add(BaseFloat alpha, const Component &other_in) {
    where necessary if not all the input lists have the same size.
 */
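Concretely, RearrangeIndexes transposes a ragged list-of-lists, padding short rows with -1 so that every output row has length in.size(). A small worked example (hypothetical values):

    // in  = { {10, 11},        out = { {10, 12},
    //         {12}     }   ==>         {11, -1} }
    // i.e. (*out)[j][i] == in[i][j] where defined, and -1 elsewhere.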
 void RearrangeIndexes(const std::vector<std::vector<int32> > &in,
-                      std::vector<std::vector<int32> > *out) {
+                       std::vector<std::vector<int32> > *out) {
   int32 D = in.size();
   int32 L = 0;
   for (int32 i = 0; i < D; i++)
@@ -3816,8 +3814,8 @@ void RearrangeIndexes(const std::vector<std::vector<int32> > &in,
 // for patches, where each patch corresponds to one dot product
 // in the convolution
 void ConvolutionComponent::InderivPatchesToInderiv(
-    const CuMatrix<BaseFloat>& in_deriv_patches,
-    CuMatrixBase<BaseFloat> *in_deriv) const {
+  const CuMatrix<BaseFloat>& in_deriv_patches,
+  CuMatrixBase<BaseFloat> *in_deriv) const {

   const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_),
               num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_),
@@ -3879,7 +3877,7 @@ void ConvolutionComponent::Backprop(const std::string &debug_info,
                                     Component *to_update_in,
                                     CuMatrixBase<BaseFloat> *in_deriv) const {
   ConvolutionComponent *to_update =
-      dynamic_cast<ConvolutionComponent*>(to_update_in);
+    dynamic_cast<ConvolutionComponent*>(to_update_in);
   const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_),
               num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_),
               num_filters = filter_params_.NumRows(),
@@ -3896,20 +3894,20 @@ void ConvolutionComponent::Backprop(const std::string &debug_info,
                                           kSetZero);

   std::vector<CuSubMatrix<BaseFloat>* > patch_deriv_batch, out_deriv_batch,
-      filter_params_batch;
+    filter_params_batch;
   CuSubMatrix<BaseFloat>* filter_params_elem = new CuSubMatrix<BaseFloat>(
-      filter_params_, 0, filter_params_.NumRows(), 0,
-      filter_params_.NumCols());
+    filter_params_, 0, filter_params_.NumRows(), 0,
+    filter_params_.NumCols());

   for (int32 x_step = 0; x_step < num_x_steps; x_step++) {
     for (int32 y_step = 0; y_step < num_y_steps; y_step++) {
       int32 patch_number = x_step * num_y_steps + y_step;
       patch_deriv_batch.push_back(new CuSubMatrix<BaseFloat>(
-          in_deriv_patches.ColRange(
-              patch_number * filter_dim, filter_dim)));
+        in_deriv_patches.ColRange(
+        patch_number * filter_dim, filter_dim)));
       out_deriv_batch.push_back(new CuSubMatrix<BaseFloat>(out_deriv.ColRange(
-          patch_number * num_filters, num_filters)));
+        patch_number * num_filters, num_filters)));
       filter_params_batch.push_back(filter_params_elem);
     }
   }
@@ -3967,8 +3965,8 @@ void ConvolutionComponent::Update(const std::string &debug_info,
   // create a single large matrix holding the smaller matrices
   // from the vector container filters_grad_batch along the rows
   CuMatrix<BaseFloat> filters_grad_blocks_batch(
-      num_x_steps * num_y_steps * filters_grad.NumRows(),
-      filters_grad.NumCols());
+    num_x_steps * num_y_steps * filters_grad.NumRows(),
+    filters_grad.NumCols());

   std::vector<CuSubMatrix<BaseFloat>* > filters_grad_batch, input_patch_batch;

@@ -3976,12 +3974,12 @@ void ConvolutionComponent::Update(const std::string &debug_info,
     for (int32 y_step = 0; y_step < num_y_steps; y_step++) {
       int32 patch_number = x_step * num_y_steps + y_step;
       filters_grad_batch.push_back(new CuSubMatrix<BaseFloat>(
-          filters_grad_blocks_batch.RowRange(
-              patch_number * filters_grad.NumRows(),
-              filters_grad.NumRows())));
+        filters_grad_blocks_batch.RowRange(
+        patch_number * filters_grad.NumRows(),
+        filters_grad.NumRows())));

       input_patch_batch.push_back(new CuSubMatrix<BaseFloat>(
-          input_patches.ColRange(patch_number * filter_dim, filter_dim)));
+        input_patches.ColRange(patch_number * filter_dim, filter_dim)));
     }
   }

@@ -3993,7 +3991,7 @@ void ConvolutionComponent::Update(const std::string &debug_info,

   // create a matrix holding the col blocks sum of out_deriv
   CuMatrix<BaseFloat> out_deriv_col_blocks_sum(out_deriv.NumRows(),
-                                               num_filters);
+      num_filters);

   // add the col blocks together to out_deriv_col_blocks_sum
   out_deriv_col_blocks_sum.AddMatBlocks(1.0, out_deriv);
@@ -4086,7 +4084,7 @@ void ConvolutionComponent::Write(std::ostream &os, bool binary) const {

 BaseFloat ConvolutionComponent::DotProduct(const UpdatableComponent &other_in) const {
   const ConvolutionComponent *other =
-      dynamic_cast<const ConvolutionComponent*>(&other_in);
+    dynamic_cast<const ConvolutionComponent*>(&other_in);
   return TraceMatMat(filter_params_, other->filter_params_, kTrans) +
          VecVec(bias_params_, other->bias_params_);
 }
@@ -4136,16 +4134,16 @@ int32 MaxpoolingComponent::InputDim() const {
 }

 MaxpoolingComponent::MaxpoolingComponent(
-    const MaxpoolingComponent &component):
-    input_x_dim_(component.input_x_dim_),
-    input_y_dim_(component.input_y_dim_),
-    input_z_dim_(component.input_z_dim_),
-    pool_x_size_(component.pool_x_size_),
-    pool_y_size_(component.pool_y_size_),
-    pool_z_size_(component.pool_z_size_),
-    pool_x_step_(component.pool_x_step_),
-    pool_y_step_(component.pool_y_step_),
-    pool_z_step_(component.pool_z_step_) { }
+  const MaxpoolingComponent &component):
+  input_x_dim_(component.input_x_dim_),
+  input_y_dim_(component.input_y_dim_),
+  input_z_dim_(component.input_z_dim_),
+  pool_x_size_(component.pool_x_size_),
+  pool_y_size_(component.pool_y_size_),
+  pool_z_size_(component.pool_z_size_),
+  pool_x_step_(component.pool_x_step_),
+  pool_y_step_(component.pool_y_step_),
+  pool_z_step_(component.pool_z_step_) { }

 // acquire output dim
 int32 MaxpoolingComponent::OutputDim() const {
@@ -4205,15 +4203,15 @@ void MaxpoolingComponent::InitFromConfig(ConfigLine *cfl) {
 // 3D tensors to patches for 3d max pooling, each patch corresponds to
 // the nodes having the same local coordinates from each pool
 void MaxpoolingComponent::InputToInputPatches(
-    const CuMatrixBase<BaseFloat>& in,
-    CuMatrix<BaseFloat> *patches) const {
+  const CuMatrixBase<BaseFloat>& in,
+  CuMatrix<BaseFloat> *patches) const{
   int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_;
   int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_;
   int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_;

   std::vector<int32> column_map(patches->NumCols());
   int32 column_map_size = column_map.size();
-  for (int32 x = 0, index = 0; x < pool_x_size_; x++) {
+  for (int32 x = 0, index =0; x < pool_x_size_; x++) {
     for (int32 y = 0; y < pool_y_size_; y++) {
       for (int32 z = 0; z < pool_z_size_; z++) {
         // given the local node coordinate, group them from each pool
@@ -4265,8 +4263,8 @@ void MaxpoolingComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
 // for patches, where each patch corresponds to
 // the nodes having the same local coordinates from each pool
 void MaxpoolingComponent::InderivPatchesToInderiv(
-    const CuMatrix<BaseFloat>& in_deriv_patches,
-    CuMatrixBase<BaseFloat> *in_deriv) const {
+  const CuMatrix<BaseFloat>& in_deriv_patches,
+  CuMatrixBase<BaseFloat> *in_deriv) const {

   int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_;
   int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_;
@@ -4282,8 +4280,8 @@ void MaxpoolingComponent::InderivPatchesToInderiv(
       for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) {
         for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) {
           int32 vector_index = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ +
-              (y_pool * pool_y_step_ + y) * input_z_dim_ +
-              (z_pool * pool_z_step_ + z);
+                                (y_pool * pool_y_step_ + y) * input_z_dim_ +
+                                (z_pool * pool_z_step_ + z);

           KALDI_ASSERT(vector_index < rev_col_map_size);
           reverse_column_map[vector_index].push_back(index);
@@ -4450,7 +4448,7 @@ void PermuteComponent::InitFromConfig(ConfigLine *cfl) {
               << column_map_str;
   if (cfl->HasUnusedValues())
     KALDI_ERR << "Could not process these elements in initializer: "
-              << cfl->UnusedValues();
+	          << cfl->UnusedValues();
   if (!ok)
     KALDI_ERR << "Invalid initializer for layer of type "
               << Type() << ": \"" << cfl->WholeLine() << "\"";
@@ -4516,7 +4514,7 @@ std::string PermuteComponent::Info() const {

 bool CompositeComponent::IsUpdatable() const {
   for (std::vector<Component*>::const_iterator iter = components_.begin(),
-       end = components_.end(); iter != end; ++iter)
+         end = components_.end(); iter != end; ++iter)
    if (((*iter)->Properties() & kUpdatableComponent) != 0)
      return true;
   return false;
@@ -4538,16 +4536,16 @@ int32 CompositeComponent::OutputDim() const {
 int32 CompositeComponent::Properties() const {
   KALDI_ASSERT(!components_.empty());
   int32 last_component_properties = components_.back()->Properties(),
-      first_component_properties = components_.front()->Properties();
+        first_component_properties = components_.front()->Properties();
   // We always assume backprop needs the input, as this would be necessary to
   // get the activations at intermediate layers; if these were not needed in
   // backprop, there would be no reason to use a CompositeComponent.
   int32 ans = kSimpleComponent | kBackpropNeedsInput |
-      (last_component_properties &
-       (kPropagateAdds | kBackpropNeedsOutput | kOutputContiguous)) |
-      (first_component_properties &
-       (kBackpropAdds | kInputContiguous)) |
-      (IsUpdatable() ? kUpdatableComponent : 0);
+              (last_component_properties &
+               (kPropagateAdds|kBackpropNeedsOutput|kOutputContiguous)) |
+              (first_component_properties &
+               (kBackpropAdds|kInputContiguous)) |
+              (IsUpdatable() ? kUpdatableComponent : 0);
   // note, we don't return the kStoresStats property because that function is
   // not implemented; instead, for efficiency, we call StoreStats() on any
   // sub-components as part of the backprop phase.
@@ -4570,13 +4568,13 @@ MatrixStrideType CompositeComponent::GetStrideType(int32 i) const {

 // virtual
 void CompositeComponent::Propagate(
-    const ComponentPrecomputedIndexes *,  // indexes
-    const CuMatrixBase<BaseFloat> &in,
-    CuMatrixBase<BaseFloat> *out) const {
+  const ComponentPrecomputedIndexes *,  // indexes
+  const CuMatrixBase<BaseFloat> &in,
+  CuMatrixBase<BaseFloat> *out) const {
   KALDI_ASSERT(in.NumRows() == out->NumRows() && in.NumCols() == InputDim() &&
                out->NumCols() == OutputDim());
   int32 num_rows = in.NumRows(),
-      num_components = components_.size();
+        num_components = components_.size();
   if (max_rows_process_ > 0 && num_rows > max_rows_process_) {
     // recurse and process smaller parts of the data, to save memory.
     for (int32 row_offset = 0; row_offset < num_rows;
@@ -4595,15 +4593,15 @@ void CompositeComponent::Propagate(
   for (int32 i = 0; i < num_components; i++) {
     if (i + 1 < num_components) {
       MatrixResizeType resize_type =
-          ((components_[i]->Properties() & kPropagateAdds) ?
-           kSetZero : kUndefined);
+        ((components_[i]->Properties() & kPropagateAdds) ?
+        kSetZero : kUndefined);
       intermediate_outputs[i].Resize(num_rows, components_[i]->OutputDim(),
                                      resize_type, GetStrideType(i));
     }
-    components_[i]->Propagate(NULL, (i == 0 ? in : intermediate_outputs[i - 1]),
-                              (i + 1 == num_components ? out : & (intermediate_outputs[i])));
+    components_[i]->Propagate(NULL, (i == 0 ? in : intermediate_outputs[i-1]),
+                              (i + 1 == num_components ? out : &(intermediate_outputs[i])));
     if (i > 0)
-      intermediate_outputs[i - 1].Resize(0, 0);
+      intermediate_outputs[i-1].Resize(0, 0);
   }
 }
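The max_rows_process_ branch above bounds the memory used for intermediate activations: instead of materializing every sub-component's output for all rows at once, the matrix is processed in row-stripes and Propagate recurses on submatrices. A sketch of the recursion pattern, simplified from the loop whose first line is visible above (the body is an assumption, since the hunk cuts it off):

    for (int32 row_offset = 0; row_offset < num_rows;
         row_offset += max_rows_process_) {
      int32 this_num_rows = std::min<int32>(max_rows_process_,
                                            num_rows - row_offset);
      const CuSubMatrix<BaseFloat> in_part(in, row_offset, this_num_rows,
                                           0, in.NumCols());
      CuSubMatrix<BaseFloat> out_part(*out, row_offset, this_num_rows,
                                      0, out->NumCols());
      this->Propagate(NULL, in_part, &out_part);  // recurse on the stripe
    }

The same trick appears again in Backprop below; both also free each intermediate matrix (Resize(0, 0)) as soon as the next component has consumed it.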
@@ -4621,7 +4619,7 @@ void CompositeComponent::Init(const std::vector<Component*> &components,
     if (i > 0) {  // make sure all the internal dimensions match up.
       KALDI_ASSERT(components_[i]->InputDim() ==
-                   components_[i - 1]->OutputDim());
+                   components_[i-1]->OutputDim());
     }
   }
 }
@@ -4683,7 +4681,7 @@ void CompositeComponent::ZeroStats() {
   // will do nothing if the component doesn't store stats.  (components like
   // ReLU and sigmoid and tanh store stats on activations).
   for (size_t i = 0; i < components_.size(); i++)
-    components_[i]->ZeroStats();
+   components_[i]->ZeroStats();
 }

 // virtual
@@ -4712,7 +4710,7 @@ void CompositeComponent::Backprop(const std::string &debug_info,
                in_value.NumCols() == InputDim() &&
                out_deriv.NumCols() == OutputDim());
   int32 num_rows = in_value.NumRows(),
-      num_components = components_.size();
+        num_components = components_.size();
   if (max_rows_process_ > 0 && num_rows > max_rows_process_) {
     KALDI_ASSERT(max_rows_process_ > 0);
     // recurse and process smaller parts of the data, to save memory.
@@ -4724,18 +4722,18 @@ void CompositeComponent::Backprop(const std::string &debug_info,
       // out_value_part will only be used if out_value is nonempty; otherwise we
      // make it a submatrix of 'out_deriv' to avoid errors in the constructor.
      const CuSubMatrix<BaseFloat> out_value_part(have_output_value ? out_value : out_deriv,
-                                                 row_offset, this_num_rows,
-                                                 0, out_deriv.NumCols());
+          row_offset, this_num_rows,
+          0, out_deriv.NumCols());
      // in_deriv_value_part will only be used if in_deriv != NULL; otherwise we
      // make it a submatrix of 'in_value' to avoid errors in the constructor.
      CuSubMatrix<BaseFloat> in_deriv_part(in_deriv != NULL ? *in_deriv : in_value,
-                                          row_offset, this_num_rows,
-                                          0, in_value.NumCols());
+          row_offset, this_num_rows,
+          0, in_value.NumCols());
      CuSubMatrix<BaseFloat> in_value_part(in_value, row_offset, this_num_rows,
                                           0, in_value.NumCols());
      const CuSubMatrix<BaseFloat> out_deriv_part(out_deriv,
-                                                 row_offset, this_num_rows,
-                                                 0, out_deriv.NumCols());
+          row_offset, this_num_rows,
+          0, out_deriv.NumCols());
      CuMatrix<BaseFloat> empty_mat;
      this->Backprop(debug_info, NULL, in_value_part,
                     (have_output_value ?
                      static_cast<const CuMatrixBase<BaseFloat>&>(out_value_part) :
@@ -4761,27 +4759,27 @@ void CompositeComponent::Backprop(const std::string &debug_info,
     // backprop doesn't need the input and the one previous to that doesn't
     // need the output.  [lowest hanging fruit for optimization]
     if (i + 2 == num_components &&
-        !(components_[i + 1]->Properties() & kBackpropNeedsInput) &&
+        !(components_[i+1]->Properties() & kBackpropNeedsInput) &&
        !(components_[i]->Properties() & kBackpropNeedsOutput))
      break;
     MatrixResizeType resize_type =
-        ((components_[i]->Properties() & kPropagateAdds) ?
-         kSetZero : kUndefined);
+      ((components_[i]->Properties() & kPropagateAdds) ?
+      kSetZero : kUndefined);
     intermediate_outputs[i].Resize(num_rows, components_[i]->OutputDim(),
                                    resize_type, GetStrideType(i));
     components_[i]->Propagate(NULL,
-                              (i == 0 ? in_value : intermediate_outputs[i - 1]),
+                              (i == 0 ? in_value : intermediate_outputs[i-1]),
                               &(intermediate_outputs[i]));
   }
   for (int32 i = num_components - 1; i >= 0; i--) {
     Component *component_to_update =
-        (to_update == NULL ? NULL :
-         dynamic_cast<CompositeComponent*>(to_update)->components_[i]);
+      (to_update == NULL ? NULL :
+      dynamic_cast<CompositeComponent*>(to_update)->components_[i]);

     if (components_[i]->Properties() & kStoresStats &&
         component_to_update != NULL)
       component_to_update->StoreStats(
-          (i + 1 == num_components ? out_value : intermediate_outputs[i]));
+        (i + 1 == num_components ? out_value : intermediate_outputs[i]));

     // skip the first component's backprop if it's not updatable and in_deriv is
     // not requested.  Again, this is the lowest-hanging fruit to optimize.
@@ -4790,17 +4788,17 @@ void CompositeComponent::Backprop(const std::string &debug_info,
       break;
     if (i > 0) {
       MatrixResizeType resize_type =
-          ((components_[i]->Properties() & kBackpropAdds) ?
-           kSetZero : kUndefined);
-      intermediate_derivs[i - 1].Resize(num_rows, components_[i]->InputDim(),
-                                        resize_type, GetStrideType(i - 1));
+        ((components_[i]->Properties() & kBackpropAdds) ?
+        kSetZero : kUndefined);
+      intermediate_derivs[i-1].Resize(num_rows, components_[i]->InputDim(),
+                                      resize_type, GetStrideType(i - 1));
     }
     components_[i]->Backprop(debug_info, NULL,
-                             (i == 0 ? in_value : intermediate_outputs[i - 1]),
+                             (i == 0 ? in_value : intermediate_outputs[i-1]),
                              (i + 1 == num_components ? out_value : intermediate_outputs[i]),
                              (i + 1 == num_components ? out_deriv : intermediate_derivs[i]),
                              component_to_update,
-                             (i == 0 ? in_deriv : & (intermediate_derivs[i - 1])));
+                             (i == 0 ? in_deriv : &(intermediate_derivs[i-1])));
   }
 }

@@ -4811,7 +4809,7 @@ std::string CompositeComponent::Info() const {
   stream << Type() << " ";
   for (size_t i = 0; i < components_.size(); i++) {
     if (i > 0) stream << ", ";
-    stream << "sub-component" << (i + 1) << " = { "
+    stream << "sub-component" << (i+1) << " = { "
            << components_[i]->Info() << " }";
   }
   return stream.str();
@@ -4826,7 +4824,7 @@ void CompositeComponent::Scale(BaseFloat scale) {

 // virtual
 void CompositeComponent::Add(BaseFloat alpha, const Component &other_in) {
   const CompositeComponent *other = dynamic_cast<const CompositeComponent*>(
-      &other_in);
+    &other_in);
   KALDI_ASSERT(other != NULL && other->components_.size() ==
                components_.size() && "Mismatching nnet topologies");
   for (size_t i = 0; i < components_.size(); i++)
@@ -4839,7 +4837,7 @@ void CompositeComponent::SetZero(bool treat_as_gradient) {
   for (size_t i = 0; i < components_.size(); i++) {
     if (components_[i]->Properties() & kUpdatableComponent) {
       UpdatableComponent *uc =
-          dynamic_cast<UpdatableComponent*>(components_[i]);
+        dynamic_cast<UpdatableComponent*>(components_[i]);
       uc->SetZero(treat_as_gradient);
     }
   }
@@ -4851,7 +4849,7 @@ void CompositeComponent::PerturbParams(BaseFloat stddev) {
   for (size_t i = 0; i < components_.size(); i++) {
     if (components_[i]->Properties() & kUpdatableComponent) {
       UpdatableComponent *uc =
-          dynamic_cast<UpdatableComponent*>(components_[i]);
+        dynamic_cast<UpdatableComponent*>(components_[i]);
       uc->PerturbParams(stddev);
     }
   }
@@ -4867,7 +4865,7 @@ void CompositeComponent::SetUnderlyingLearningRate(BaseFloat lrate) {
   for (size_t i = 0; i < components_.size(); i++) {
     if (components_[i]->Properties() & kUpdatableComponent) {
       UpdatableComponent *uc =
-          dynamic_cast<UpdatableComponent*>(components_[i]);
+        dynamic_cast<UpdatableComponent*>(components_[i]);
       uc->SetUnderlyingLearningRate(effective_lrate);
     }
   }
@@ -4879,7 +4877,7 @@ void CompositeComponent::SetActualLearningRate(BaseFloat lrate) {
   for (size_t i = 0; i < components_.size(); i++) {
     if (components_[i]->Properties() & kUpdatableComponent) {
       UpdatableComponent *uc =
-          dynamic_cast<UpdatableComponent*>(components_[i]);
+        dynamic_cast<UpdatableComponent*>(components_[i]);
       uc->SetActualLearningRate(lrate);
     }
   }
@@ -4892,7 +4890,7 @@ int32 CompositeComponent::NumParameters() const {
   for (size_t i = 0; i < components_.size(); i++) {
     if (components_[i]->Properties() & kUpdatableComponent) {
       UpdatableComponent *uc =
-          dynamic_cast<UpdatableComponent*>(components_[i]);
+        dynamic_cast<UpdatableComponent*>(components_[i]);
       ans += uc->NumParameters();
     }
   }
@@ -4906,7 +4904,7 @@ void CompositeComponent::Vectorize(VectorBase<BaseFloat> *params) const {
   for (size_t i = 0; i < components_.size(); i++) {
     if (components_[i]->Properties() & kUpdatableComponent) {
       UpdatableComponent *uc =
-          dynamic_cast<UpdatableComponent*>(components_[i]);
+        dynamic_cast<UpdatableComponent*>(components_[i]);
       int32 this_size = uc->NumParameters();
       SubVector<BaseFloat> params_range(*params, cur_offset, this_size);
       uc->Vectorize(&params_range);
@@ -4923,7 +4921,7 @@ void CompositeComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
   for (size_t i = 0; i < components_.size(); i++) {
     if (components_[i]->Properties() & kUpdatableComponent) {
       UpdatableComponent *uc =
-          dynamic_cast<UpdatableComponent*>(components_[i]);
+        dynamic_cast<UpdatableComponent*>(components_[i]);
       int32 this_size = uc->NumParameters();
       SubVector<BaseFloat> params_range(params, cur_offset, this_size);
       uc->UnVectorize(params_range);
@@ -4935,18 +4933,18 @@ void CompositeComponent::UnVectorize(const VectorBase<BaseFloat> &params) {

 // virtual
 BaseFloat CompositeComponent::DotProduct(
-    const UpdatableComponent &other_in) const {
+  const UpdatableComponent &other_in) const {
   const CompositeComponent *other = dynamic_cast<const CompositeComponent*>(
-      &other_in);
+    &other_in);
   KALDI_ASSERT(other != NULL && other->components_.size() ==
                components_.size() && "Mismatching nnet topologies");
   BaseFloat ans = 0.0;
   for (size_t i = 0; i < components_.size(); i++) {
     if (components_[i]->Properties() & kUpdatableComponent) {
       UpdatableComponent *uc =
-          dynamic_cast<UpdatableComponent*>(components_[i]);
+        dynamic_cast<UpdatableComponent*>(components_[i]);
       const UpdatableComponent *uc_other =
-          dynamic_cast<const UpdatableComponent*>(other->components_[i]);
+        dynamic_cast<const UpdatableComponent*>(other->components_[i]);
       KALDI_ASSERT(uc != NULL && uc_other != NULL);
       ans += uc->DotProduct(*uc_other);
     }
@@ -4996,7 +4994,7 @@ void CompositeComponent::InitFromConfig(ConfigLine *cfl) {
               << "(or undefined or bad component type [type=xxx]), in "
              << "CompositeComponent config line '" << cfl->WholeLine() << "'";
   }
-  if (this_component->Type() == "CompositeComponent") {
+  if(this_component->Type() == "CompositeComponent") {
     DeletePointers(&components);
     delete this_component;
     KALDI_ERR << "Found CompositeComponent nested within CompositeComponent."
@@ -5119,8 +5117,7 @@ std::string LstmNonlinearityComponent::Info() const {
            << std::setprecision(6);
   }
   static const char *nonlin_names[] = { "i_t_sigmoid", "f_t_sigmoid", "c_t_tanh",
-                                        "o_t_sigmoid", "m_t_tanh"
-  };
+                                        "o_t_sigmoid", "m_t_tanh" };
   for (int32 i = 0; i < 5; i++) {
     stream << ", " << nonlin_names[i] << "={";
     stream << " self-repair-lower-threshold=" << self_repair_config_(i)
@@ -5128,10 +5125,10 @@ std::string LstmNonlinearityComponent::Info() const {

     if (count_ != 0) {
       BaseFloat self_repaired_proportion =
-          self_repair_total_(i) / (count_ * cell_dim);
+        self_repair_total_(i) / (count_ * cell_dim);
       stream << ", self-repaired-proportion=" << self_repaired_proportion;
       Vector<double> value_sum(value_sum_.Row(i)),
-          deriv_sum(deriv_sum_.Row(i));
+                     deriv_sum(deriv_sum_.Row(i));
       Vector<double> value_avg(value_sum), deriv_avg(deriv_sum);
       value_avg.Scale(1.0 / count_);
       deriv_avg.Scale(1.0 / count_);
@@ -5159,7 +5156,7 @@ void LstmNonlinearityComponent::Scale(BaseFloat scale) {
 void LstmNonlinearityComponent::Add(BaseFloat alpha,
                                     const Component &other_in) {
   const LstmNonlinearityComponent *other =
-      dynamic_cast<const LstmNonlinearityComponent*>(&other_in);
+    dynamic_cast<const LstmNonlinearityComponent*>(&other_in);
   KALDI_ASSERT(other != NULL);
   params_.AddMat(alpha, other->params_);
   value_sum_.AddMat(alpha, other->value_sum_);
@@ -5187,9 +5184,9 @@ void LstmNonlinearityComponent::PerturbParams(BaseFloat stddev) {
 }

 BaseFloat LstmNonlinearityComponent::DotProduct(
-    const UpdatableComponent &other_in) const {
+  const UpdatableComponent &other_in) const {
   const LstmNonlinearityComponent *other =
-      dynamic_cast<const LstmNonlinearityComponent*>(&other_in);
+    dynamic_cast<const LstmNonlinearityComponent*>(&other_in);
   KALDI_ASSERT(other != NULL);
   return TraceMatMat(params_, other->params_, kTrans);
 }
@@ -5205,28 +5202,28 @@ void LstmNonlinearityComponent::Vectorize(VectorBase<BaseFloat> *params) const {

 void LstmNonlinearityComponent::UnVectorize(
-    const VectorBase<BaseFloat> &params) {
+  const VectorBase<BaseFloat> &params) {
   KALDI_ASSERT(params.Dim() == NumParameters());
   params_.CopyRowsFromVec(params);
 }

 void LstmNonlinearityComponent::Propagate(
-    const ComponentPrecomputedIndexes *, // indexes
-    const CuMatrixBase<BaseFloat> &in,
-    CuMatrixBase<BaseFloat> *out) const {
+  const ComponentPrecomputedIndexes *, // indexes
+  const CuMatrixBase<BaseFloat> &in,
+  CuMatrixBase<BaseFloat> *out) const {
   cu::ComputeLstmNonlinearity(in, params_, out);
 }

 void LstmNonlinearityComponent::Backprop(
-    const std::string &debug_info,
-    const ComponentPrecomputedIndexes *indexes,
-    const CuMatrixBase<BaseFloat> &in_value,
-    const CuMatrixBase<BaseFloat> &, // out_value,
-    const CuMatrixBase<BaseFloat> &out_deriv,
-    Component *to_update_in,
-    CuMatrixBase<BaseFloat> *in_deriv) const {
+  const std::string &debug_info,
+  const ComponentPrecomputedIndexes *indexes,
+  const CuMatrixBase<BaseFloat> &in_value,
+  const CuMatrixBase<BaseFloat> &, // out_value,
+  const CuMatrixBase<BaseFloat> &out_deriv,
+  Component *to_update_in,
+  CuMatrixBase<BaseFloat> *in_deriv) const {

   if (to_update_in == NULL) {
     cu::BackpropLstmNonlinearity(in_value, params_, out_deriv,
@@ -5238,7 +5235,7 @@ void LstmNonlinearityComponent::Backprop(
                                  (CuMatrixBase<double>*) NULL);
   } else {
     LstmNonlinearityComponent *to_update =
-        dynamic_cast<LstmNonlinearityComponent*>(to_update_in);
+      dynamic_cast<LstmNonlinearityComponent*>(to_update_in);
     KALDI_ASSERT(to_update != NULL);

     int32 cell_dim = params_.NumCols();
@@ -5260,7 +5257,7 @@ void LstmNonlinearityComponent::Backprop(
     BaseFloat scale = 1.0;
     if (!to_update->is_gradient_) {
       to_update->preconditioner_.PreconditionDirections(
-          &params_deriv, NULL, &scale);
+        &params_deriv, NULL, &scale);
     }
     to_update->params_.AddMat(to_update->learning_rate_ * scale,
                               params_deriv);
@@ -5268,21 +5265,21 @@ void LstmNonlinearityComponent::Backprop(
 }
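For orientation, params_ here is a 3 x cell_dim matrix of diagonal (peephole) weights w_ic, w_fc, w_oc, and ComputeLstmNonlinearity evaluates, per row, roughly the following (a sketch of the standard LSTM equations this component implements; the exact input layout is defined in the header):

    // input  = ( i_part, f_part, c_part, o_part, c_{t-1} ), 5 * cell_dim wide
    // i_t = Sigmoid(i_part + w_ic .* c_{t-1})
    // f_t = Sigmoid(f_part + w_fc .* c_{t-1})
    // c_t = f_t .* c_{t-1} + i_t .* Tanh(c_part)
    // o_t = Sigmoid(o_part + w_oc .* c_t)
    // m_t = o_t .* Tanh(c_t)
    // output = ( c_t, m_t ), 2 * cell_dim wide

This is also what the five names in nonlin_names[] above refer to, and why the self-repair stats are tracked separately per nonlinearity.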
 LstmNonlinearityComponent::LstmNonlinearityComponent(
-    const LstmNonlinearityComponent &other):
-    UpdatableComponent(other),
-    params_(other.params_),
-    value_sum_(other.value_sum_),
-    deriv_sum_(other.deriv_sum_),
-    self_repair_config_(other.self_repair_config_),
-    self_repair_total_(other.self_repair_total_),
-    count_(other.count_),
-    preconditioner_(other.preconditioner_) { }
+  const LstmNonlinearityComponent &other):
+  UpdatableComponent(other),
+  params_(other.params_),
+  value_sum_(other.value_sum_),
+  deriv_sum_(other.deriv_sum_),
+  self_repair_config_(other.self_repair_config_),
+  self_repair_total_(other.self_repair_total_),
+  count_(other.count_),
+  preconditioner_(other.preconditioner_) { }

 void LstmNonlinearityComponent::Init(
-    int32 cell_dim, BaseFloat param_stddev,
-    BaseFloat tanh_self_repair_threshold,
-    BaseFloat sigmoid_self_repair_threshold,
-    BaseFloat self_repair_scale) {
+  int32 cell_dim, BaseFloat param_stddev,
+  BaseFloat tanh_self_repair_threshold,
+  BaseFloat sigmoid_self_repair_threshold,
+  BaseFloat self_repair_scale) {
   KALDI_ASSERT(cell_dim > 0 && param_stddev >= 0.0 &&
                tanh_self_repair_threshold >= 0.0 &&
                tanh_self_repair_threshold <= 1.0 &&
@@ -5329,8 +5326,8 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) {
   // self-repair config values for the individual sigmoid and tanh
   // nonlinearities, we can modify this code then.
   BaseFloat tanh_self_repair_threshold = 0.2,
-      sigmoid_self_repair_threshold = 0.05,
-      self_repair_scale = 1.0e-05;
+            sigmoid_self_repair_threshold = 0.05,
+            self_repair_scale = 1.0e-05;
   // param_stddev is the stddev of the parameters.  it may be better to
   // use a smaller value but this was the default in the python scripts
   // for a while.
@@ -5349,7 +5346,7 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) {

   if (cfl->HasUnusedValues())
     KALDI_ERR << "Could not process these elements in initializer: "
-              << cfl->UnusedValues();
+	          << cfl->UnusedValues();
   if (!ok)
     KALDI_ERR << "Invalid initializer for layer of type "
               << Type() << ": \"" << cfl->WholeLine() << "\"";