From e86d92c4a29209cea65fda6140c26156863c7f1f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 20 Mar 2018 01:48:03 +0000 Subject: [PATCH 001/135] Test input a graph. --- src/operator/nn/control_flow.cc | 62 +++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 src/operator/nn/control_flow.cc diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc new file mode 100644 index 000000000000..47c66c37b20b --- /dev/null +++ b/src/operator/nn/control_flow.cc @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace mxnet { +namespace op { + +static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK(attrs.g != nullptr); +} + +NNVM_REGISTER_OP(Foreach) +.set_num_inputs(3) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"fn", "data1", "data2"}; +}) +.set_attr("FInputGraph", + [](const NodeAttrs& attrs) { + return 0; +}) +//.set_attr("FInferShape", ConvolutionShape) +//.set_attr("FInferType", ConvolutionType) +.describe(R"code(test)code" ADD_FILELINE) +//.set_attr_parser(ParamParser) +//.set_attr("FInferStorageType", ActivationStorageType) +.set_attr("FComputeEx", ForeachComputeExCPU) +.add_argument("fn", "Symbol", "Input graph.") +.add_argument("data1", "NDArray-or-Symbol", "Input1.") +.add_argument("data2", "NDArray-or-Symbol", "Input2."); +//.add_arguments(ActivationParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet From 7eacfabf5b75597b40207b93419afca5e0978300 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 21 Mar 2018 12:05:25 -0700 Subject: [PATCH 002/135] Update foreach to execute the subgraph. --- include/mxnet/imperative.h | 22 +-- src/operator/nn/control_flow.cc | 237 +++++++++++++++++++++++++++++++- 2 files changed, 242 insertions(+), 17 deletions(-) diff --git a/include/mxnet/imperative.h b/include/mxnet/imperative.h index 7ea60df33028..d50c1c371428 100644 --- a/include/mxnet/imperative.h +++ b/include/mxnet/imperative.h @@ -105,18 +105,18 @@ class Imperative { std::vector* p_save_inputs = nullptr, std::vector* p_save_outputs = nullptr); /*! \brief */ - OpStatePtr Invoke(const Context& default_ctx, - const nnvm::NodeAttrs& attrs, - const std::vector& inputs, - const std::vector& outputs); + static OpStatePtr Invoke(const Context& default_ctx, + const nnvm::NodeAttrs& attrs, + const std::vector& inputs, + const std::vector& outputs); /*! 
\brief */ - OpStatePtr InvokeOp(const Context& ctx, - const nnvm::NodeAttrs& attrs, - const std::vector& inputs, - const std::vector& outputs, - const std::vector& req, - const DispatchMode dispatch_mode, - OpStatePtr state = OpStatePtr()); + static OpStatePtr InvokeOp(const Context& ctx, + const nnvm::NodeAttrs& attrs, + const std::vector& inputs, + const std::vector& outputs, + const std::vector& req, + const DispatchMode dispatch_mode, + OpStatePtr state = OpStatePtr()); /*! \brief mark variables for computing gradients. */ void MarkVariables(const std::vector& variables, const std::vector& grad_reqs, diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 47c66c37b20b..15a375d7f9b7 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -24,19 +24,247 @@ #include #include #include +#include "../operator_common.h" +#include "../../imperative/imperative_utils.h" namespace mxnet { namespace op { +void RunGraph(const nnvm::IndexedGraph& idx, + const std::vector arrays, + size_t node_start, size_t node_end, + std::vector&& array_reqs, + std::vector&& ref_count, + std::vector *p_states, + const DispatchModeVector &dispatch_modes) { + using namespace nnvm; + using namespace imperative; + static auto& createop = nnvm::Op::GetAttr("FCreateOpState"); + static auto& is_layer_backward = Op::GetAttr("TIsLayerOpBackward"); + + std::vector& states = *p_states; + std::vector ndinputs, ndoutputs; + ShapeVector arg_shapes; + DTypeVector arg_dtypes; + std::vector req; + + for (size_t i = node_start; i < node_end; ++i) { + const nnvm::IndexedGraph::Node& node = idx[i]; + if (node.source->op() == nullptr) continue; + auto num_outputs = node.source->num_outputs(); + ndinputs.clear(); + ndinputs.reserve(node.inputs.size()); + for (const auto& j : node.inputs) { + ndinputs.emplace_back(arrays[idx.entry_id(j)]); + CHECK(!ndinputs.back()->is_none()) << idx[j.node_id].source->attrs.name << " " << j.index; + } + ndoutputs.clear(); + ndoutputs.reserve(num_outputs); + req.clear(); + req.reserve(num_outputs); + for (size_t j = 0; j < num_outputs; ++j) { + size_t eid = idx.entry_id(i, j); + ndoutputs.emplace_back(arrays[eid]); + req.push_back(array_reqs[eid]); + CHECK(!ndoutputs.back()->is_none()); + } + const Context& ctx = ndoutputs[0]->ctx(); + const DispatchMode dispatch_mode = dispatch_modes[i]; + if (createop.count(node.source->op())) { + arg_shapes.clear(); + arg_dtypes.clear(); + arg_shapes.reserve(ndinputs.size()); + arg_dtypes.reserve(ndinputs.size()); + for (size_t i = 0; i < ndinputs.size(); ++i) { + arg_shapes.emplace_back(ndinputs[i]->shape()); + arg_dtypes.emplace_back(ndinputs[i]->dtype()); + } + states[i] = createop[node.source->op()]( + node.source->attrs, ctx, arg_shapes, arg_dtypes); + Imperative::InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, req, + dispatch_mode, states[i]); + } else if (is_layer_backward.get(node.source->op(), false)) { + nnvm::Node* fwd_node = node.source->control_deps[0].get(); + auto fwd_node_id = idx.node_id(fwd_node); + Imperative::InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, + req, dispatch_mode, states[fwd_node_id]); + } else { + Imperative::InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, + req, dispatch_mode); + } + } +} + +static void ExecSubgraph(nnvm::Graph &g, const OpContext& ctx, + const std::vector& cinputs, + const std::vector& req, + const std::vector& coutputs) { + using namespace nnvm; + using namespace imperative; + const auto& idx = g.indexed_graph(); + size_t num_inputs = 
idx.input_nodes().size(); + + CHECK_EQ(num_inputs, cinputs.size()) + << "The subgraph requires " << num_inputs << " but got " << cinputs.size(); + + Context default_ctx = cinputs[0].ctx(); + for (size_t i = 0; i < cinputs.size(); ++i) { + CHECK_EQ(cinputs[i].ctx(), default_ctx) + << "The subgraph requires all inputs to live on the same context. But " + << idx[idx.input_nodes()[0]].source->attrs.name << " is on " << default_ctx + << " while " << idx[idx.input_nodes()[i]].source->attrs.name << " is on " + << cinputs[i].ctx(); + } + + // TODO(zhengda) we might want to buffer them. + std::vector buff; + std::vector states; + std::vector inputs = cinputs; + std::vector outputs = coutputs; + + // Allocate entries + states.resize(idx.num_nodes()); + buff.resize(idx.num_node_entries()); + states.reserve(idx.num_nodes()); + std::vector arrays; + arrays.reserve(buff.size()); + for (size_t i = 0; i < buff.size(); ++i) arrays.push_back(&buff[i]); + for (size_t i = 0; i < num_inputs; ++i) { + arrays[idx.entry_id(idx.input_nodes()[i], 0)] = &inputs[i]; + } + for (size_t i = 0; i < idx.outputs().size(); ++i) { + auto eid = idx.entry_id(idx.outputs()[i]); + if (!arrays[eid]->is_none()) outputs[i] = arrays[eid]->Detach(); + arrays[eid] = &outputs[i]; + } + + // Allocate memory for the NDArrays + std::vector ref_count = g.GetAttr >( + ctx.is_train ? "full_ref_count" : "forward_ref_count"); + + std::vector array_reqs(arrays.size(), kWriteTo); + for (size_t i = 0; i < idx.num_node_entries(); ++i) { + if (ref_count[i] == 0) array_reqs[i] = kNullOp; + } + + const auto& mem_plan = g.GetAttr( + ctx.is_train ? "full_mem_plan" : "forward_mem_plan"); + AllocateMemory(g, idx, default_ctx, 0, idx.num_node_entries(), + mem_plan, arrays, &array_reqs); + + const auto& dispatch_modes = g.GetAttr("dispatch_mode"); + + RunGraph(idx, arrays, 0, idx.num_nodes(), std::move(array_reqs), + std::move(ref_count), &states, dispatch_modes); +} + static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { CHECK(attrs.g != nullptr); + nnvm::Graph &g = *attrs.g; + + printf("test1\n"); + // If this is inference, we only need the forward memory plan. + bool has_mem_plan = !ctx.is_train && g.attrs.count("forward_mem_plan"); + printf("test2\n"); + // If this is training, we need the full memory plan. + has_mem_plan = has_mem_plan || (ctx.is_train && g.attrs.count("full_mem_plan")); + printf("test3\n"); + // If we don't have a memory plan yet, we need to create a memory plan. + if (!has_mem_plan) { + const auto& idx = g.indexed_graph(); + nnvm::StorageVector storage(idx.num_node_entries(), exec::kBadStorageID); + for (const auto i : idx.input_nodes()) + storage[idx.entry_id(i, 0)] = exec::kExternalStorageID; + printf("test4\n"); + const auto& stypes = g.GetAttr("storage_type"); + printf("test5\n"); + CHECK_EQ(stypes.size(), storage.size()); + for (size_t i = 0; i < stypes.size(); i++) { + if (stypes[i] != kDefaultStorage) + storage[i] = exec::kDynamicStorageID; + } + + auto mem_plan = imperative::PlanMemory( + &g, std::move(storage), g.GetAttr >( + ctx.is_train ? "full_ref_count" : "forward_ref_count")); + printf("test6\n"); + // TODO(zhengda) we need to be careful of changing graph attributes. + // It's not thread-safe. + g.attrs[ctx.is_train ? 
"full_mem_plan" : "forward_mem_plan"] + = std::make_shared(std::move(mem_plan)); + printf("test7\n"); + } + printf("test8\n"); + ExecSubgraph(g, ctx, inputs, req, outputs); +} + +static bool ForeachShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + nnvm::ShapeVector shape_inputs = *in_shape; + auto g = attrs.g; + CHECK(g); + // TODO(zhengda) This can also be called in the execution engine. + // We need to make it thread-safe. + imperative::CheckAndInferShape(g.get(), std::move(shape_inputs), true); + const auto& shapes = g->GetAttr("shape"); + CHECK(g->outputs.size() == 1); + uint32_t eid = g->indexed_graph().entry_id(g->outputs[0]); + (*out_shape)[0] = shapes[eid]; + return true; +} + +static bool ForeachType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + nnvm::DTypeVector dtype_inputs = *in_type; + auto g = attrs.g; + CHECK(g); + // TODO(zhengda) This can also be called in the execution engine. + // We need to make it thread-safe. + imperative::CheckAndInferType(g.get(), std::move(dtype_inputs), true); + const auto &dtypes = g->GetAttr("dtype"); + CHECK(g->outputs.size() == 1); + uint32_t eid = g->indexed_graph().entry_id(g->outputs[0]); + (*out_type)[0] = dtypes[eid]; + return true; +} + +static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + auto g = attrs.g; + CHECK(g); + printf("test1\n"); + const auto& idx = g->indexed_graph(); + CHECK(idx.input_nodes().size() == in_attrs->size()); + exec::DevMaskVector dev_masks(idx.num_nodes(), dev_mask); + StorageTypeVector &storage_type_inputs = *in_attrs; + printf("test2\n"); + imperative::CheckAndInferStorageType(g.get(), std::move(dev_masks), + std::move(storage_type_inputs), true); + printf("test3\n"); + *dispatch_mode = DispatchMode::kFComputeEx; + const auto& stypes = g->GetAttr("storage_type"); + auto &outputs = idx.outputs(); + CHECK(outputs.size() == out_attrs->size()); + printf("test4\n"); + for (size_t i = 0; i < out_attrs->size(); i++) { + (*out_attrs)[i] = stypes[idx.entry_id(outputs[i])]; + } + printf("test5\n"); + return true; } NNVM_REGISTER_OP(Foreach) +.describe(R"code(Foreach)code" ADD_FILELINE) +//.set_attr_parser(ParamParser) +.set_attr("FInferStorageType", ForeachStorageType) .set_num_inputs(3) .set_num_outputs(1) .set_attr("FListInputNames", @@ -47,16 +275,13 @@ NNVM_REGISTER_OP(Foreach) [](const NodeAttrs& attrs) { return 0; }) -//.set_attr("FInferShape", ConvolutionShape) -//.set_attr("FInferType", ConvolutionType) -.describe(R"code(test)code" ADD_FILELINE) -//.set_attr_parser(ParamParser) -//.set_attr("FInferStorageType", ActivationStorageType) +.set_attr("FInferShape", ForeachShape) +.set_attr("FInferType", ForeachType) .set_attr("FComputeEx", ForeachComputeExCPU) .add_argument("fn", "Symbol", "Input graph.") .add_argument("data1", "NDArray-or-Symbol", "Input1.") .add_argument("data2", "NDArray-or-Symbol", "Input2."); -//.add_arguments(ActivationParam::__FIELDS__()); +//.add_arguments(ForeachParam::__FIELDS__()); } // namespace op } // namespace mxnet From 6874f7d21f3bad414315fa4f0bb5611ac876516f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 21 Mar 2018 21:55:27 +0000 Subject: [PATCH 003/135] print inputs/outputs in foreach. 
--- src/operator/nn/control_flow.cc | 38 +++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 15a375d7f9b7..3abe6baa50ce 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -95,6 +95,21 @@ void RunGraph(const nnvm::IndexedGraph& idx, } } +static inline void print_dims(const mxnet::NDArray &arr, const std::string name = "") { + printf("%s: ", name.c_str()); + for (size_t i = 0; i < arr.shape().ndim(); i++) + printf("%ld, ", arr.shape()[i]); + printf("\n"); +} + +static inline void print(const mxnet::NDArray &arr, const std::string name = "") { + print_dims(arr, name); + float *data = (float *) arr.data().dptr_; + for (size_t i = 0; i < arr.shape().Size(); i++) + printf("%f, ", data[i]); + printf("\n"); +} + static void ExecSubgraph(nnvm::Graph &g, const OpContext& ctx, const std::vector& cinputs, const std::vector& req, @@ -151,11 +166,19 @@ static void ExecSubgraph(nnvm::Graph &g, const OpContext& ctx, ctx.is_train ? "full_mem_plan" : "forward_mem_plan"); AllocateMemory(g, idx, default_ctx, 0, idx.num_node_entries(), mem_plan, arrays, &array_reqs); + print(inputs[0], "data1"); + print(inputs[1], "data2"); + print(outputs[0], "output"); const auto& dispatch_modes = g.GetAttr("dispatch_mode"); RunGraph(idx, arrays, 0, idx.num_nodes(), std::move(array_reqs), std::move(ref_count), &states, dispatch_modes); + + printf("After running graph\n"); + print(inputs[0], "data1"); + print(inputs[1], "data2"); + print(outputs[0], "output"); } static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, @@ -166,22 +189,17 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, CHECK(attrs.g != nullptr); nnvm::Graph &g = *attrs.g; - printf("test1\n"); // If this is inference, we only need the forward memory plan. bool has_mem_plan = !ctx.is_train && g.attrs.count("forward_mem_plan"); - printf("test2\n"); // If this is training, we need the full memory plan. has_mem_plan = has_mem_plan || (ctx.is_train && g.attrs.count("full_mem_plan")); - printf("test3\n"); // If we don't have a memory plan yet, we need to create a memory plan. if (!has_mem_plan) { const auto& idx = g.indexed_graph(); nnvm::StorageVector storage(idx.num_node_entries(), exec::kBadStorageID); for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID; - printf("test4\n"); const auto& stypes = g.GetAttr("storage_type"); - printf("test5\n"); CHECK_EQ(stypes.size(), storage.size()); for (size_t i = 0; i < stypes.size(); i++) { if (stypes[i] != kDefaultStorage) @@ -191,14 +209,11 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, auto mem_plan = imperative::PlanMemory( &g, std::move(storage), g.GetAttr >( ctx.is_train ? "full_ref_count" : "forward_ref_count")); - printf("test6\n"); // TODO(zhengda) we need to be careful of changing graph attributes. // It's not thread-safe. g.attrs[ctx.is_train ? 
"full_mem_plan" : "forward_mem_plan"] = std::make_shared(std::move(mem_plan)); - printf("test7\n"); } - printf("test8\n"); ExecSubgraph(g, ctx, inputs, req, outputs); } @@ -240,24 +255,19 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { auto g = attrs.g; CHECK(g); - printf("test1\n"); const auto& idx = g->indexed_graph(); CHECK(idx.input_nodes().size() == in_attrs->size()); exec::DevMaskVector dev_masks(idx.num_nodes(), dev_mask); - StorageTypeVector &storage_type_inputs = *in_attrs; - printf("test2\n"); + StorageTypeVector storage_type_inputs = *in_attrs; imperative::CheckAndInferStorageType(g.get(), std::move(dev_masks), std::move(storage_type_inputs), true); - printf("test3\n"); *dispatch_mode = DispatchMode::kFComputeEx; const auto& stypes = g->GetAttr("storage_type"); auto &outputs = idx.outputs(); CHECK(outputs.size() == out_attrs->size()); - printf("test4\n"); for (size_t i = 0; i < out_attrs->size(); i++) { (*out_attrs)[i] = stypes[idx.entry_id(outputs[i])]; } - printf("test5\n"); return true; } From 8f5e62e173026abc0be8327dc164071176caab4b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 23 Mar 2018 22:08:53 +0000 Subject: [PATCH 004/135] Remove print. --- src/operator/nn/control_flow.cc | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 3abe6baa50ce..614de336e938 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -56,7 +56,8 @@ void RunGraph(const nnvm::IndexedGraph& idx, ndinputs.reserve(node.inputs.size()); for (const auto& j : node.inputs) { ndinputs.emplace_back(arrays[idx.entry_id(j)]); - CHECK(!ndinputs.back()->is_none()) << idx[j.node_id].source->attrs.name << " " << j.index; + CHECK(!ndinputs.back()->is_none()) << idx[j.node_id].source->attrs.name + << " " << j.index; } ndoutputs.clear(); ndoutputs.reserve(num_outputs); @@ -95,21 +96,6 @@ void RunGraph(const nnvm::IndexedGraph& idx, } } -static inline void print_dims(const mxnet::NDArray &arr, const std::string name = "") { - printf("%s: ", name.c_str()); - for (size_t i = 0; i < arr.shape().ndim(); i++) - printf("%ld, ", arr.shape()[i]); - printf("\n"); -} - -static inline void print(const mxnet::NDArray &arr, const std::string name = "") { - print_dims(arr, name); - float *data = (float *) arr.data().dptr_; - for (size_t i = 0; i < arr.shape().Size(); i++) - printf("%f, ", data[i]); - printf("\n"); -} - static void ExecSubgraph(nnvm::Graph &g, const OpContext& ctx, const std::vector& cinputs, const std::vector& req, @@ -166,19 +152,10 @@ static void ExecSubgraph(nnvm::Graph &g, const OpContext& ctx, ctx.is_train ? "full_mem_plan" : "forward_mem_plan"); AllocateMemory(g, idx, default_ctx, 0, idx.num_node_entries(), mem_plan, arrays, &array_reqs); - print(inputs[0], "data1"); - print(inputs[1], "data2"); - print(outputs[0], "output"); const auto& dispatch_modes = g.GetAttr("dispatch_mode"); - RunGraph(idx, arrays, 0, idx.num_nodes(), std::move(array_reqs), std::move(ref_count), &states, dispatch_modes); - - printf("After running graph\n"); - print(inputs[0], "data1"); - print(inputs[1], "data2"); - print(outputs[0], "output"); } static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, From 1dd35a8e806696cb7f98e1e743b7764270b597bc Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 23 Mar 2018 22:09:23 +0000 Subject: [PATCH 005/135] add test code for foreach. 
--- tests/python/unittest/test_operator.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 461fb63514c1..8a42c7392a3d 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5937,6 +5937,24 @@ def test_float16_min_max(): assert np.finfo('float16').max == mx.nd.max(a).asscalar() +@with_seed() +def test_foreach(): + v1 = mx.sym.var("v1") + v2 = mx.sym.var("v2") + v3 = mx.sym.var("v3") + v4 = mx.sym.var("v4") + g = v1 + v2 + op = mx.sym.Foreach(g, v3, v4) + arr1 = mx.nd.random.uniform(shape=(5, 2)) + arr2 = mx.nd.random.uniform(shape=(5, 2)) + e = op.bind(ctx=mx.cpu(), args={'v3': arr1, 'v4': arr2}) + e.forward() + for y in e.outputs: + y.wait_to_read() + print(y) + print(arr1 + arr2) + + @with_seed() def test_squeeze_op(): def check_squeeze_op(shape, axis=None): From 1f117cb03e08fd8af8250b9e7e8691ce74cc5b0e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 5 Apr 2018 21:37:27 +0000 Subject: [PATCH 006/135] exec foreach outside the engine. --- src/executor/attach_op_execs_pass.cc | 8 ++++++++ src/executor/exec_pass.h | 3 +++ src/executor/graph_executor.cc | 24 +++++++++++++++++++++--- tests/python/unittest/test_operator.py | 5 +++-- 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 72919d90c620..eb0d0052e5f7 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -207,6 +207,10 @@ class FComputeExecutor : public StorageFallbackOpExecutor { return exec_type_; } + bool HasSubgraph() const override { + return attrs_.g != nullptr; + } + explicit FComputeExecutor(const NodeAttrs& attrs, FCompute fcompute, ExecType exec_type, const std::vector &mutate_idx) : StorageFallbackOpExecutor(mutate_idx), @@ -232,6 +236,10 @@ class FComputeExExecutor : public OpExecutor { void Setup() override {} + bool HasSubgraph() const override { + return attrs_.g != nullptr; + } + ExecType exec_type() const override { return exec_type_; } diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index 26a249118940..6f4a5611c1bf 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -64,6 +64,9 @@ class OpExecutor { OpContext op_ctx; /*! \brief virtual destructor */ virtual ~OpExecutor() {} + virtual bool HasSubgraph() const { + return false; + } /*! * \brief Setup the executor for given NDArray member * this can be called multiple times if NDArray changed during reshape. diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 831b5f900237..399581697b1d 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1489,7 +1489,11 @@ void GraphExecutor::BulkTrainingOpSegs(size_t total_num_nodes) { // check if the segment relies on external input, or exceeds maxinum number of node, // or requires async ops if (node->is_variable() || nid - topo_start > num_nodes_threshold || - op_node.exec->exec_type() != ExecType::kSync) { + op_node.exec->exec_type() != ExecType::kSync || + // If the node has a subgraph, we shouldn't add it to the segment. + // We'll execute the node separately from other nodes. + // CreateCachedSegOpr creates a segment excluding nodes with subgraphs. 
+ op_node.exec->HasSubgraph()) { // create a new segment for the previous nodes if the current one cannot be bulked cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, nid); topo_start = nid + 1; @@ -1514,7 +1518,11 @@ void GraphExecutor::BulkTrainingOpSegs(size_t total_num_nodes) { continue; } if (idx[nid].source->is_variable() || nid - topo_start > num_nodes_threshold || - op_node.exec->exec_type() != ExecType::kSync) { + op_node.exec->exec_type() != ExecType::kSync || + // If the node has a subgraph, we shouldn't add it to the segment. + // We'll execute the node separately from other nodes. + // CreateCachedSegOpr creates a segment excluding nodes with subgraphs. + op_node.exec->HasSubgraph()) { cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, nid); topo_start = nid + 1; } else { @@ -1548,7 +1556,11 @@ void GraphExecutor::BulkInferenceOpSegs() { // Variables do not need to be segmented at inference time. if (node->is_variable()) continue; - if (op_node.exec->exec_type() != ExecType::kSync) { + if (op_node.exec->exec_type() != ExecType::kSync || + // If the node has a subgraph, we shouldn't add it to the segment. + // We'll execute the node separately from other nodes. + // CreateCachedSegOpr creates a segment excluding nodes with subgraphs. + op_node.exec->HasSubgraph()) { cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, nid); topo_start = nid + 1; } @@ -1614,6 +1626,9 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { CHECK_EQ(opnode.exec->in_array.size(), 1U); CHECK_EQ(opnode.exec->out_array.size(), 1U); CopyFromTo(opnode.exec->in_array[0], &(opnode.exec->out_array[0])); + } else if (opnode.exec->HasSubgraph()) { + // If the node contains a subgraph, we can't execute it in the engine. + opnode.exec->Run(opnode.exec->op_ctx.run_ctx, false); } else if (opnode.cached_opr != nullptr) { bool profiling = profiler::Profiler::Get()->GetState() == profiler::Profiler::kRunning; Engine::Get()->Push(opnode.cached_opr, opnode.ctx, 0, profiling); @@ -1648,6 +1663,9 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start, OpNode& op_node = op_nodes_[nid]; if (op_node.skip_exec_node) continue; if (inode.source->is_variable()) continue; + // We shouldn't add control flow operators to a segment. + // We can't execute these operators in the engine. + if (op_node.exec->HasSubgraph()) continue; if (op_node.exec->exec_type() != ExecType::kSync) { return ret; } diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 8a42c7392a3d..23f19bfed32d 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5944,10 +5944,11 @@ def test_foreach(): v3 = mx.sym.var("v3") v4 = mx.sym.var("v4") g = v1 + v2 - op = mx.sym.Foreach(g, v3, v4) + out = mx.sym.Foreach(g, v3, v4) + out = out * 2 arr1 = mx.nd.random.uniform(shape=(5, 2)) arr2 = mx.nd.random.uniform(shape=(5, 2)) - e = op.bind(ctx=mx.cpu(), args={'v3': arr1, 'v4': arr2}) + e = out.bind(ctx=mx.cpu(), args={'v3': arr1, 'v4': arr2}) e.forward() for y in e.outputs: y.wait_to_read() From cc29fc13b124e8df195d1c3a4586495358c613d3 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 5 Apr 2018 23:05:51 +0000 Subject: [PATCH 007/135] Implements forward of foreach. 
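The operator now actually loops. Ignoring memory reuse, what this patch computes corresponds to the reference below; the sketch is purely illustrative (foreach_reference, body, data and init_state are made-up names, not part of the change), and it reflects the current restriction that there is a single state which is identical to the previous step's output:

    import numpy as np

    def foreach_reference(body, data, init_state):
        # data is iterated over its first dimension; the step output is
        # written into the matching slice of the result and also becomes
        # the state fed to the next iteration.
        out = np.empty_like(data)
        state = init_state
        for i in range(data.shape[0]):
            out[i] = body(data[i], state)
            state = out[i]
        return out

In the C++ implementation each iteration slices inputs[0] with At(i), feeds it together with the carried state into the subgraph, writes the result into outputs[0].At(i), and calls WaitToRead() so the subgraph's intermediate memory can be reused by the next iteration. The unit test is updated accordingly to check an accumulation sum against a NumPy loop.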
--- src/operator/nn/control_flow.cc | 41 ++++++++++++++++++++++++-- tests/python/unittest/test_operator.py | 16 ++++++---- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 614de336e938..5226290cd937 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -191,13 +191,45 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, g.attrs[ctx.is_train ? "full_mem_plan" : "forward_mem_plan"] = std::make_shared(std::move(mem_plan)); } - ExecSubgraph(g, ctx, inputs, req, outputs); + size_t len = inputs[0].shape()[0]; + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); + std::vector subg_inputs(inputs.size()); + std::vector subg_outputs(outputs.size()); + for (size_t i = 1; i < inputs.size(); i++) + subg_inputs[i] = inputs[i]; + // Here we iterate over the first dimension of the first input array. + for (size_t i = 0; i < len; i++) { + subg_inputs[0] = inputs[0].At(i); + // For the first iteration, the second argument is the second input array, + // i.e., the initial state. + if (i == 0) + subg_inputs[1] = inputs[1]; + else + // For the rest of the iterations, the second argument is the output from + // the previous iteration. + subg_inputs[1] = subg_outputs[0]; + subg_outputs[0] = outputs[0].At(i); + + ExecSubgraph(g, ctx, subg_inputs, req, subg_outputs); + // We need to wait for the iteration to complete before executing + // the next one or return from the loop. In this way, we can reuse + // the memory in the subgraph. + for (size_t j = 0; j < subg_outputs.size(); j++) + subg_outputs[j].WaitToRead(); + } } static bool ForeachShape(const nnvm::NodeAttrs& attrs, std::vector *in_shape, std::vector *out_shape) { + CHECK_EQ(in_shape->size(), 2U); nnvm::ShapeVector shape_inputs = *in_shape; + // foreach iterates over the first input NDArray over the first dimension. + shape_inputs[0] = TShape(in_shape->at(0).begin() + 1, in_shape->at(0).end()); + bool ret = shape_assign(&shape_inputs[1], shape_inputs[0]); + CHECK(ret); auto g = attrs.g; CHECK(g); // TODO(zhengda) This can also be called in the execution engine. 
@@ -206,7 +238,12 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, const auto& shapes = g->GetAttr("shape"); CHECK(g->outputs.size() == 1); uint32_t eid = g->indexed_graph().entry_id(g->outputs[0]); - (*out_shape)[0] = shapes[eid]; + const auto& g_out_shape = shapes[eid]; + const auto& in0 = (*in_shape)[0]; + CHECK_EQ(g_out_shape.ndim() + 1, in0.ndim()); + for (size_t i = 1; i < in0.ndim(); i++) + CHECK_EQ(in0[i], g_out_shape[i - 1]); + (*out_shape)[0] = in0; return true; } diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 23f19bfed32d..900ce940aaff 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5947,13 +5947,19 @@ def test_foreach(): out = mx.sym.Foreach(g, v3, v4) out = out * 2 arr1 = mx.nd.random.uniform(shape=(5, 2)) - arr2 = mx.nd.random.uniform(shape=(5, 2)) + arr2 = mx.nd.random.uniform(shape=(2)) e = out.bind(ctx=mx.cpu(), args={'v3': arr1, 'v4': arr2}) e.forward() - for y in e.outputs: - y.wait_to_read() - print(y) - print(arr1 + arr2) + arr1 = arr1.asnumpy() + arr2 = arr2.asnumpy() + np_res = np.zeros_like(arr1) + for i in range(arr1.shape[0]): + if (i == 0): + np_res[i] = arr2 + arr1[i] + else: + np_res[i] = np_res[i - 1] + arr1[i] + np_res = np_res * 2 + assert_almost_equal(e.outputs[0].asnumpy(), np_res, rtol=0.001, atol=0.0001) @with_seed() From 036aada1cb851af6e05ea3c9815b4b2f4d4b8e6e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 6 Apr 2018 01:12:22 +0000 Subject: [PATCH 008/135] Add support for variable numbers of inputs and outputs. --- src/operator/nn/control_flow.cc | 131 ++++++++++++++++++------- tests/python/unittest/test_operator.py | 5 +- 2 files changed, 99 insertions(+), 37 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 5226290cd937..0ff7201e03bc 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -158,6 +158,19 @@ static void ExecSubgraph(nnvm::Graph &g, const OpContext& ctx, std::move(ref_count), &states, dispatch_modes); } +struct ForeachParam : public dmlc::Parameter { + int num_args; + int dim; + DMLC_DECLARE_PARAMETER(ForeachParam) { + DMLC_DECLARE_FIELD(num_args).set_lower_bound(1) + .describe("Number of inputs."); + DMLC_DECLARE_FIELD(dim).set_default(1) + .describe("the dimension of the input array to iterate."); + } +}; // struct ForeachParam + +DMLC_REGISTER_PARAMETER(ForeachParam); + static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, @@ -192,58 +205,95 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, = std::make_shared(std::move(mem_plan)); } size_t len = inputs[0].shape()[0]; - CHECK_EQ(outputs.size(), 1U); - CHECK_EQ(inputs.size(), 2U); CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); + + // Initialize the inputs for the subgraph. std::vector subg_inputs(inputs.size()); - std::vector subg_outputs(outputs.size()); - for (size_t i = 1; i < inputs.size(); i++) + for (size_t i = 1; i < inputs.size(); i++) { + // These are the initial states. subg_inputs[i] = inputs[i]; + } + + // Initialize the outputs of the subgraph is a little trickier. + // The states from the previous iteration are used as the inputs of the next + // iteration, so I have to maintain two arrays, so the inputs and outputs + // of the subgraph share the same memory. 
+ std::vector subg_outputs1(inputs.size()); + std::vector subg_outputs2(inputs.size()); + std::vector *subg_outputs[2]{&subg_outputs1, &subg_outputs2}; + // If the length is an odd number, the last iteration will use the first set + // of outputs. In this way, we don't need to copy the results from the + // subgraph to the final outputs of the loop. + if (len % 2 == 1) { + for (size_t i = 1; i < subg_outputs1.size(); i++) { + subg_outputs1[i] = outputs[i]; + subg_outputs2[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), false, + outputs[i].dtype()); + } + } else { + // Otherwise, we'll use the second set of outputs. + for (size_t i = 1; i < subg_outputs1.size(); i++) { + subg_outputs1[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), false, + outputs[i].dtype()); + subg_outputs2[i] = outputs[i]; + } + } + // Here we iterate over the first dimension of the first input array. for (size_t i = 0; i < len; i++) { + std::vector *subg_out_curr = subg_outputs[i % 2]; + std::vector *subg_out_prev = subg_outputs[(i + 1) % 2]; + (*subg_out_curr)[0] = outputs[0].At(i); + + // Get a slice from the first input array. subg_inputs[0] = inputs[0].At(i); - // For the first iteration, the second argument is the second input array, - // i.e., the initial state. - if (i == 0) - subg_inputs[1] = inputs[1]; - else - // For the rest of the iterations, the second argument is the output from - // the previous iteration. - subg_inputs[1] = subg_outputs[0]; - subg_outputs[0] = outputs[0].At(i); + // For the rest of the iterations, the rest of the arguments are the outputs + // from the previous iteration. + if (i > 0) { + for (size_t j = 1; j < subg_out_prev->size(); j++) + subg_inputs[j] = (*subg_out_prev)[j]; + } - ExecSubgraph(g, ctx, subg_inputs, req, subg_outputs); + ExecSubgraph(g, ctx, subg_inputs, req, *subg_out_curr); // We need to wait for the iteration to complete before executing // the next one or return from the loop. In this way, we can reuse // the memory in the subgraph. - for (size_t j = 0; j < subg_outputs.size(); j++) - subg_outputs[j].WaitToRead(); + for (size_t j = 0; j < subg_out_curr->size(); j++) + (*subg_out_curr)[j].WaitToRead(); } } static bool ForeachShape(const nnvm::NodeAttrs& attrs, std::vector *in_shape, std::vector *out_shape) { - CHECK_EQ(in_shape->size(), 2U); nnvm::ShapeVector shape_inputs = *in_shape; // foreach iterates over the first input NDArray over the first dimension. shape_inputs[0] = TShape(in_shape->at(0).begin() + 1, in_shape->at(0).end()); - bool ret = shape_assign(&shape_inputs[1], shape_inputs[0]); - CHECK(ret); auto g = attrs.g; CHECK(g); + const auto& idx = g->indexed_graph(); + CHECK_EQ(idx.input_nodes().size(), in_shape->size()); + CHECK_EQ(idx.outputs().size(), out_shape->size()); // TODO(zhengda) This can also be called in the execution engine. // We need to make it thread-safe. imperative::CheckAndInferShape(g.get(), std::move(shape_inputs), true); const auto& shapes = g->GetAttr("shape"); - CHECK(g->outputs.size() == 1); - uint32_t eid = g->indexed_graph().entry_id(g->outputs[0]); + + // For the first shape. 
+ uint32_t eid = idx.entry_id(g->outputs[0]); const auto& g_out_shape = shapes[eid]; - const auto& in0 = (*in_shape)[0]; + const auto &in0 = (*in_shape)[0]; + auto &out0 = (*out_shape)[0]; CHECK_EQ(g_out_shape.ndim() + 1, in0.ndim()); - for (size_t i = 1; i < in0.ndim(); i++) - CHECK_EQ(in0[i], g_out_shape[i - 1]); - (*out_shape)[0] = in0; + out0 = in0; + for (size_t i = 1; i < out0.ndim(); i++) + out0[i] = g_out_shape[i - 1]; + + // For the remaining shapes. + for (size_t i = 1; i < g->outputs.size(); i++) { + uint32_t eid = idx.entry_id(g->outputs[i]); + (*out_shape)[i] = shapes[eid]; + } return true; } @@ -252,13 +302,15 @@ static bool ForeachType(const nnvm::NodeAttrs& attrs, nnvm::DTypeVector dtype_inputs = *in_type; auto g = attrs.g; CHECK(g); + const auto& idx = g->indexed_graph(); + CHECK_EQ(idx.input_nodes().size(), in_type->size()); + CHECK_EQ(idx.outputs().size(), out_type->size()); // TODO(zhengda) This can also be called in the execution engine. // We need to make it thread-safe. imperative::CheckAndInferType(g.get(), std::move(dtype_inputs), true); const auto &dtypes = g->GetAttr("dtype"); - CHECK(g->outputs.size() == 1); - uint32_t eid = g->indexed_graph().entry_id(g->outputs[0]); - (*out_type)[0] = dtypes[eid]; + for (size_t i = 0; i < g->outputs.size(); i++) + (*out_type)[i] = dtypes[idx.entry_id(g->outputs[i])]; return true; } @@ -270,7 +322,8 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, auto g = attrs.g; CHECK(g); const auto& idx = g->indexed_graph(); - CHECK(idx.input_nodes().size() == in_attrs->size()); + CHECK_EQ(idx.input_nodes().size(), in_attrs->size()); + CHECK_EQ(idx.outputs().size(), out_attrs->size()); exec::DevMaskVector dev_masks(idx.num_nodes(), dev_mask); StorageTypeVector storage_type_inputs = *in_attrs; imperative::CheckAndInferStorageType(g.get(), std::move(dev_masks), @@ -279,18 +332,23 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, const auto& stypes = g->GetAttr("storage_type"); auto &outputs = idx.outputs(); CHECK(outputs.size() == out_attrs->size()); - for (size_t i = 0; i < out_attrs->size(); i++) { + for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = stypes[idx.entry_id(outputs[i])]; - } return true; } NNVM_REGISTER_OP(Foreach) .describe(R"code(Foreach)code" ADD_FILELINE) -//.set_attr_parser(ParamParser) +.set_attr_parser(ParamParser) .set_attr("FInferStorageType", ForeachStorageType) -.set_num_inputs(3) -.set_num_outputs(1) +.set_num_inputs([](const NodeAttrs& attrs) { + const ForeachParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_num_outputs([](const NodeAttrs& attrs) { + const ForeachParam& params = nnvm::get(attrs.parsed); + return params.num_args - 1; +}) .set_attr("FListInputNames", [](const NodeAttrs& attrs) { return std::vector{"fn", "data1", "data2"}; @@ -302,9 +360,10 @@ NNVM_REGISTER_OP(Foreach) .set_attr("FInferShape", ForeachShape) .set_attr("FInferType", ForeachType) .set_attr("FComputeEx", ForeachComputeExCPU) +.set_attr("key_var_num_args", "num_args") .add_argument("fn", "Symbol", "Input graph.") -.add_argument("data1", "NDArray-or-Symbol", "Input1.") -.add_argument("data2", "NDArray-or-Symbol", "Input2."); +.add_argument("input", "NDArray-or-Symbol", "The input array where we iterate over.") +.add_argument("states", "NDArray-or-Symbol[]", "The list of initial states."); //.add_arguments(ForeachParam::__FIELDS__()); } // namespace op diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 
900ce940aaff..cbee41d71f86 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5944,8 +5944,11 @@ def test_foreach(): v3 = mx.sym.var("v3") v4 = mx.sym.var("v4") g = v1 + v2 + # TODO This is problematic. We can't count on the user to define two different symbols. + g = mx.sym.Group([g, g * 1]) out = mx.sym.Foreach(g, v3, v4) - out = out * 2 + out1 = out[0] * 2 + out = mx.sym.Group([out1, out[1]]) arr1 = mx.nd.random.uniform(shape=(5, 2)) arr2 = mx.nd.random.uniform(shape=(2)) e = out.bind(ctx=mx.cpu(), args={'v3': arr1, 'v4': arr2}) From 84e0e24fb3296fb6abf5e638823e109d07658e51 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 6 Apr 2018 19:05:30 +0000 Subject: [PATCH 009/135] Add a python wrapper for foreach. --- python/mxnet/contrib/__init__.py | 1 + python/mxnet/contrib/control_flow.py | 39 ++++++++++++++++++++++++++ tests/python/unittest/test_operator.py | 13 +++++---- 3 files changed, 47 insertions(+), 6 deletions(-) create mode 100644 python/mxnet/contrib/control_flow.py diff --git a/python/mxnet/contrib/__init__.py b/python/mxnet/contrib/__init__.py index fbfd3469678b..7489d97d90fe 100644 --- a/python/mxnet/contrib/__init__.py +++ b/python/mxnet/contrib/__init__.py @@ -32,3 +32,4 @@ from . import io from . import quantization from . import quantization as quant +from . import control_flow as cf diff --git a/python/mxnet/contrib/control_flow.py b/python/mxnet/contrib/control_flow.py new file mode 100644 index 000000000000..8b128fbb8d77 --- /dev/null +++ b/python/mxnet/contrib/control_flow.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx + +def foreach(func, input, init_states, back_prop=False): + in_ele = mx.sym.var("in") + states = [] + i = 0 + assert isinstance(init_states, list), "init_states should be a list" + for s in init_states: + states.append(mx.sym.var("state" + str(i))) + i = i + 1 + sym_out = func(in_ele, states) + # The function should return a tuple. The first element goes to + # the output of the function. The second element is a list. 
+ assert isinstance(sym_out, tuple), "func should return a tuple (out, states)" + assert isinstance(sym_out[1], list), \ + "the second element in the returned tuple should be a list" + + flat_out = [sym_out[0]] + for s in sym_out[1]: + flat_out.append(s) + g = mx.sym.Group(flat_out) + return mx.sym.contrib.Foreach(g, input, *init_states) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index cbee41d71f86..02da2b60eacf 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5939,14 +5939,15 @@ def test_float16_min_max(): @with_seed() def test_foreach(): - v1 = mx.sym.var("v1") - v2 = mx.sym.var("v2") v3 = mx.sym.var("v3") v4 = mx.sym.var("v4") - g = v1 + v2 - # TODO This is problematic. We can't count on the user to define two different symbols. - g = mx.sym.Group([g, g * 1]) - out = mx.sym.Foreach(g, v3, v4) + + def step(in1, states): + out = in1 + states[0] + # TODO This is problematic. We can't count on the user to define two different symbols. + return (out, [out * 1]) + + out = mx.contrib.cf.foreach(step, v3, [v4]) out1 = out[0] * 2 out = mx.sym.Group([out1, out[1]]) arr1 = mx.nd.random.uniform(shape=(5, 2)) From f2a28f04e41b8c5b6df33ed268036c6eb711dac5 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 6 Apr 2018 23:57:35 +0000 Subject: [PATCH 010/135] Fix the order of inputs. --- src/operator/nn/control_flow.cc | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 0ff7201e03bc..7d59fa7d5c30 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -171,6 +171,26 @@ struct ForeachParam : public dmlc::Parameter { DMLC_REGISTER_PARAMETER(ForeachParam); +// The input arguments are ordered in the following order: +// in, state0, state1, ... +// We need to reorder them in the same order as the input nodes of the subgraph. +template +static std::vector ReorderInputs(const std::vector &in, const nnvm::IndexedGraph& idx) { + std::vector ret(in.size()); + CHECK_EQ(idx.input_nodes().size(), in.size()); + for (size_t i = 0; i < idx.input_nodes().size(); i++) { + std::string name = idx[idx.input_nodes()[i]].source->attrs.name; + if (name == "in") { + ret[i] = in[0]; + } else { + auto idx_str = name.substr(5); + int idx = std::stoi(idx_str); + ret[i] = in[idx + 1]; + } + } + return ret; +} + static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, @@ -178,6 +198,7 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { CHECK(attrs.g != nullptr); nnvm::Graph &g = *attrs.g; + const auto& idx = g.indexed_graph(); // If this is inference, we only need the forward memory plan. bool has_mem_plan = !ctx.is_train && g.attrs.count("forward_mem_plan"); @@ -185,7 +206,6 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, has_mem_plan = has_mem_plan || (ctx.is_train && g.attrs.count("full_mem_plan")); // If we don't have a memory plan yet, we need to create a memory plan. 
if (!has_mem_plan) { - const auto& idx = g.indexed_graph(); nnvm::StorageVector storage(idx.num_node_entries(), exec::kBadStorageID); for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID; @@ -254,7 +274,8 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, subg_inputs[j] = (*subg_out_prev)[j]; } - ExecSubgraph(g, ctx, subg_inputs, req, *subg_out_curr); + std::vector reordered_ins = ReorderInputs(subg_inputs, idx); + ExecSubgraph(g, ctx, reordered_ins, req, *subg_out_curr); // We need to wait for the iteration to complete before executing // the next one or return from the loop. In this way, we can reuse // the memory in the subgraph. @@ -276,6 +297,7 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(idx.outputs().size(), out_shape->size()); // TODO(zhengda) This can also be called in the execution engine. // We need to make it thread-safe. + shape_inputs = ReorderInputs(shape_inputs, idx); imperative::CheckAndInferShape(g.get(), std::move(shape_inputs), true); const auto& shapes = g->GetAttr("shape"); @@ -307,6 +329,7 @@ static bool ForeachType(const nnvm::NodeAttrs& attrs, CHECK_EQ(idx.outputs().size(), out_type->size()); // TODO(zhengda) This can also be called in the execution engine. // We need to make it thread-safe. + dtype_inputs = ReorderInputs(dtype_inputs, idx); imperative::CheckAndInferType(g.get(), std::move(dtype_inputs), true); const auto &dtypes = g->GetAttr("dtype"); for (size_t i = 0; i < g->outputs.size(); i++) @@ -326,6 +349,7 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(idx.outputs().size(), out_attrs->size()); exec::DevMaskVector dev_masks(idx.num_nodes(), dev_mask); StorageTypeVector storage_type_inputs = *in_attrs; + storage_type_inputs = ReorderInputs(storage_type_inputs, idx); imperative::CheckAndInferStorageType(g.get(), std::move(dev_masks), std::move(storage_type_inputs), true); *dispatch_mode = DispatchMode::kFComputeEx; From 1c4cf0eb4b781cff59e137b0f6aae4fda2f85e58 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 7 Apr 2018 00:06:49 +0000 Subject: [PATCH 011/135] hide C version of foreach. --- python/mxnet/contrib/control_flow.py | 2 +- src/operator/nn/control_flow.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/mxnet/contrib/control_flow.py b/python/mxnet/contrib/control_flow.py index 8b128fbb8d77..b6705ca4552a 100644 --- a/python/mxnet/contrib/control_flow.py +++ b/python/mxnet/contrib/control_flow.py @@ -36,4 +36,4 @@ def foreach(func, input, init_states, back_prop=False): for s in sym_out[1]: flat_out.append(s) g = mx.sym.Group(flat_out) - return mx.sym.contrib.Foreach(g, input, *init_states) + return mx.sym._internal._foreach(g, input, *init_states) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 7d59fa7d5c30..e95b88df2ad8 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -361,8 +361,8 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, return true; } -NNVM_REGISTER_OP(Foreach) -.describe(R"code(Foreach)code" ADD_FILELINE) +NNVM_REGISTER_OP(_foreach) +.describe(R"code(foreach)code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FInferStorageType", ForeachStorageType) .set_num_inputs([](const NodeAttrs& attrs) { From 9aa896dc5e29a89b31f63c143af7ee491ddc68ec Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 7 Apr 2018 01:11:26 +0000 Subject: [PATCH 012/135] fix a bug temporarily. 
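The workaround addresses the issue flagged in the earlier TODOs: if the body returns the same symbol both as the step output and as a state (or passes an input through unchanged as a state), the grouped subgraph ends up with several outputs referring to the same graph entry, which the operator cannot handle yet. For instance, a body of the form

    def step(in1, states):
        out = in1 + states[0]
        return (out, [out])   # step output and state are the same symbol

would hit the problem, which is why the accumulation test multiplied the state by 1. Wrapping every state output in mx.sym.identity inside the wrapper forces a distinct node per output; as the TODO says, this is only a stop-gap.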
--- python/mxnet/contrib/control_flow.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/mxnet/contrib/control_flow.py b/python/mxnet/contrib/control_flow.py index b6705ca4552a..42d869208898 100644 --- a/python/mxnet/contrib/control_flow.py +++ b/python/mxnet/contrib/control_flow.py @@ -34,6 +34,9 @@ def foreach(func, input, init_states, back_prop=False): flat_out = [sym_out[0]] for s in sym_out[1]: - flat_out.append(s) + # There is a problem if the outputs are the same as the inputs + # or the first output. + # TODO this is a temp fix. + flat_out.append(mx.sym.identity(s)) g = mx.sym.Group(flat_out) return mx.sym._internal._foreach(g, input, *init_states) From 67844086d1ed2e35c21b3aab4cd282876258bded Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 6 Apr 2018 23:57:57 +0000 Subject: [PATCH 013/135] add test with lstm. --- tests/python/unittest/test_operator.py | 57 ++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 02da2b60eacf..301d1eabf82b 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5942,6 +5942,7 @@ def test_foreach(): v3 = mx.sym.var("v3") v4 = mx.sym.var("v4") + # This tests foreach with accumulation sum. def step(in1, states): out = in1 + states[0] # TODO This is problematic. We can't count on the user to define two different symbols. @@ -5966,6 +5967,62 @@ def step(in1, states): assert_almost_equal(e.outputs[0].asnumpy(), np_res, rtol=0.001, atol=0.0001) +@with_seed() +def test_foreach_lstm(): + # This tests foreach with accumulation sum. + def step(in1, states): + params = mx.rnn.RNNParams() + params._params['i2h_weight'] = states[2] + params._params['h2h_weight'] = states[3] + params._params['i2h_bias'] = states[4] + params._params['h2h_bias'] = states[5] + lstm = mx.rnn.LSTMCell(4, prefix='mylstm_', params=params) + prev_states = [states[0], states[1]] + next_h, [next_h, next_c] = lstm(in1, prev_states) + # TODO This is problematic. We can't count on the user to define two different symbols. 
+ return (next_h, [next_h, next_c, states[2], states[3], states[4], states[5]]) + + data = mx.sym.var("data") + init_h = mx.sym.var("h") + init_c = mx.sym.var("c") + i2h_weight = mx.sym.var("i2h_weight") + h2h_weight = mx.sym.var("h2h_weight") + i2h_bias = mx.sym.var("i2h_bias") + h2h_bias = mx.sym.var("h2h_bias") + + data_arr = mx.nd.random.uniform(shape=(5, 2, 4)) + h_arr = mx.nd.random.uniform(shape=(2, 4)) + c_arr = mx.nd.random.uniform(shape=(2, 4)) + i2h_warr = mx.nd.random.uniform(shape=(16, 4)) + h2h_warr = mx.nd.random.uniform(shape=(16, 4)) + i2h_barr = mx.nd.random.uniform(shape=(16)) + h2h_barr = mx.nd.random.uniform(shape=(16)) + + out = mx.contrib.cf.foreach(step, data, [init_h, init_c, i2h_weight, h2h_weight, i2h_bias, h2h_bias]) + e = out.bind(ctx=mx.cpu(), args={'data': data_arr, 'h': h_arr, 'c': c_arr, + 'i2h_weight': i2h_warr, 'h2h_weight': h2h_warr, 'i2h_bias': i2h_barr, 'h2h_bias': h2h_barr}) + e.forward() + outputs1 = e.outputs + + lstm = mx.rnn.LSTMCell(4, prefix='mylstm_') + h = init_h + c = init_c + unroll_outs = [] + for inputs in mx.sym.split(data, num_outputs=data_arr.shape[0], axis=0, squeeze_axis=True): + h, [h, c] = lstm(inputs, [h, c]) + unroll_outs.append(mx.sym.expand_dims(h, axis=0)) + unroll_outs = mx.sym.concat(*unroll_outs, dim=0) + out = mx.sym.Group([unroll_outs, h, c]) + e = out.bind(ctx=mx.cpu(), args={'data': data_arr, 'h': h_arr, 'c': c_arr, + 'mylstm_i2h_weight': i2h_warr, 'mylstm_h2h_weight': h2h_warr, + 'mylstm_i2h_bias': i2h_barr, 'mylstm_h2h_bias': h2h_barr}) + e.forward() + outputs2 = e.outputs + + for i in range(len(outputs2)): + assert_almost_equal(outputs1[i].asnumpy(), outputs2[i].asnumpy(), rtol=0.001, atol=0.0001) + + @with_seed() def test_squeeze_op(): def check_squeeze_op(shape, axis=None): From f4886475192868716467100434cc12fc99298952 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 7 Apr 2018 01:45:37 +0000 Subject: [PATCH 014/135] Test free variables. --- python/mxnet/contrib/control_flow.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/mxnet/contrib/control_flow.py b/python/mxnet/contrib/control_flow.py index 42d869208898..781d6c7be9b1 100644 --- a/python/mxnet/contrib/control_flow.py +++ b/python/mxnet/contrib/control_flow.py @@ -19,11 +19,13 @@ def foreach(func, input, init_states, back_prop=False): in_ele = mx.sym.var("in") + gin_names = ["in"] states = [] i = 0 assert isinstance(init_states, list), "init_states should be a list" for s in init_states: states.append(mx.sym.var("state" + str(i))) + gin_names.append("state" + str(i)) i = i + 1 sym_out = func(in_ele, states) # The function should return a tuple. The first element goes to @@ -39,4 +41,9 @@ def foreach(func, input, init_states, back_prop=False): # TODO this is a temp fix. flat_out.append(mx.sym.identity(s)) g = mx.sym.Group(flat_out) + + # The input function can't have free variables right now. + for i in g.list_inputs(): + assert i in gin_names, "The input function can't contain free variables" + return mx.sym._internal._foreach(g, input, *init_states) From d9b0c50f8d3723767da8b2aba3e0ad819eb1aec4 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 9 Apr 2018 13:09:56 -0700 Subject: [PATCH 015/135] change for the new interface of InputGraph attribute. 
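Follow-up to the interface change: the body graph is no longer stored in a dedicated attrs.g field but in the generic attrs.subgraphs vector. The executors therefore detect subgraph operators via !attrs_.subgraphs.empty(), foreach fetches its body as *attrs.subgraphs[0] (checking that exactly one subgraph is attached), and FInputGraph now returns a vector of input positions rather than a single index.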
--- src/executor/attach_op_execs_pass.cc | 4 ++-- src/operator/nn/control_flow.cc | 15 +++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index eb0d0052e5f7..ca0c76e9df8f 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -208,7 +208,7 @@ class FComputeExecutor : public StorageFallbackOpExecutor { } bool HasSubgraph() const override { - return attrs_.g != nullptr; + return !attrs_.subgraphs.empty(); } explicit FComputeExecutor(const NodeAttrs& attrs, FCompute fcompute, @@ -237,7 +237,7 @@ class FComputeExExecutor : public OpExecutor { void Setup() override {} bool HasSubgraph() const override { - return attrs_.g != nullptr; + return !attrs_.subgraphs.empty(); } ExecType exec_type() const override { diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index e95b88df2ad8..99a12a9fa901 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -196,8 +196,8 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - CHECK(attrs.g != nullptr); - nnvm::Graph &g = *attrs.g; + CHECK_EQ(attrs.subgraphs.size(), 1U); + nnvm::Graph &g = *attrs.subgraphs[0]; const auto& idx = g.indexed_graph(); // If this is inference, we only need the forward memory plan. @@ -290,7 +290,8 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, nnvm::ShapeVector shape_inputs = *in_shape; // foreach iterates over the first input NDArray over the first dimension. shape_inputs[0] = TShape(in_shape->at(0).begin() + 1, in_shape->at(0).end()); - auto g = attrs.g; + CHECK_EQ(attrs.subgraphs.size(), 1U); + auto g = attrs.subgraphs[0]; CHECK(g); const auto& idx = g->indexed_graph(); CHECK_EQ(idx.input_nodes().size(), in_shape->size()); @@ -322,7 +323,8 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, static bool ForeachType(const nnvm::NodeAttrs& attrs, std::vector *in_type, std::vector *out_type) { nnvm::DTypeVector dtype_inputs = *in_type; - auto g = attrs.g; + CHECK_EQ(attrs.subgraphs.size(), 1U); + auto g = attrs.subgraphs[0]; CHECK(g); const auto& idx = g->indexed_graph(); CHECK_EQ(idx.input_nodes().size(), in_type->size()); @@ -342,7 +344,8 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, DispatchMode* dispatch_mode, std::vector *in_attrs, std::vector *out_attrs) { - auto g = attrs.g; + CHECK_EQ(attrs.subgraphs.size(), 1U); + auto g = attrs.subgraphs[0]; CHECK(g); const auto& idx = g->indexed_graph(); CHECK_EQ(idx.input_nodes().size(), in_attrs->size()); @@ -379,7 +382,7 @@ NNVM_REGISTER_OP(_foreach) }) .set_attr("FInputGraph", [](const NodeAttrs& attrs) { - return 0; + return std::vector{0}; }) .set_attr("FInferShape", ForeachShape) .set_attr("FInferType", ForeachType) From 1188d9745127af453b5c0c5ee34ee136aa2be945 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 11 Apr 2018 21:04:40 +0000 Subject: [PATCH 016/135] Add attribute to the subgraph. 
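The loop body is now built inside mx.AttrScope(subgraph_name=name), so every symbol that func creates carries the subgraph's name as an attribute. This gives later passes a way to tell nodes of the loop body apart from nodes of the enclosing graph (the following patch also adds subgraph_name to the C API's list of hidden attribute keys).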
--- python/mxnet/contrib/control_flow.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/python/mxnet/contrib/control_flow.py b/python/mxnet/contrib/control_flow.py index 781d6c7be9b1..a839c467ad7e 100644 --- a/python/mxnet/contrib/control_flow.py +++ b/python/mxnet/contrib/control_flow.py @@ -17,7 +17,7 @@ import mxnet as mx -def foreach(func, input, init_states, back_prop=False): +def foreach(func, input, init_states, back_prop=False, name="foreach"): in_ele = mx.sym.var("in") gin_names = ["in"] states = [] @@ -27,19 +27,20 @@ def foreach(func, input, init_states, back_prop=False): states.append(mx.sym.var("state" + str(i))) gin_names.append("state" + str(i)) i = i + 1 - sym_out = func(in_ele, states) - # The function should return a tuple. The first element goes to - # the output of the function. The second element is a list. - assert isinstance(sym_out, tuple), "func should return a tuple (out, states)" - assert isinstance(sym_out[1], list), \ - "the second element in the returned tuple should be a list" + with mx.AttrScope(subgraph_name=name): + sym_out = func(in_ele, states) + # The function should return a tuple. The first element goes to + # the output of the function. The second element is a list. + assert isinstance(sym_out, tuple), "func should return a tuple (out, states)" + assert isinstance(sym_out[1], list), \ + "the second element in the returned tuple should be a list" - flat_out = [sym_out[0]] - for s in sym_out[1]: - # There is a problem if the outputs are the same as the inputs - # or the first output. - # TODO this is a temp fix. - flat_out.append(mx.sym.identity(s)) + flat_out = [sym_out[0]] + for s in sym_out[1]: + # There is a problem if the outputs are the same as the inputs + # or the first output. + # TODO this is a temp fix. + flat_out.append(mx.sym.identity(s)) g = mx.sym.Group(flat_out) # The input function can't have free variables right now. From c0cd6aca87af4f22281dd27a15609a0fb2f59558 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 11 Apr 2018 22:21:20 +0000 Subject: [PATCH 017/135] Handle free variables. --- python/mxnet/contrib/control_flow.py | 39 ++++++++++++++++++++++---- src/c_api/c_api_symbolic.cc | 5 ++-- src/operator/nn/control_flow.cc | 36 ++++++++++++++++-------- tests/python/unittest/test_operator.py | 27 +++++++++--------- 4 files changed, 73 insertions(+), 34 deletions(-) diff --git a/python/mxnet/contrib/control_flow.py b/python/mxnet/contrib/control_flow.py index a839c467ad7e..2e3a8997b016 100644 --- a/python/mxnet/contrib/control_flow.py +++ b/python/mxnet/contrib/control_flow.py @@ -24,8 +24,8 @@ def foreach(func, input, init_states, back_prop=False, name="foreach"): i = 0 assert isinstance(init_states, list), "init_states should be a list" for s in init_states: - states.append(mx.sym.var("state" + str(i))) - gin_names.append("state" + str(i)) + states.append(mx.sym.var(s.name)) + gin_names.append(s.name) i = i + 1 with mx.AttrScope(subgraph_name=name): sym_out = func(in_ele, states) @@ -43,8 +43,35 @@ def foreach(func, input, init_states, back_prop=False, name="foreach"): flat_out.append(mx.sym.identity(s)) g = mx.sym.Group(flat_out) - # The input function can't have free variables right now. - for i in g.list_inputs(): - assert i in gin_names, "The input function can't contain free variables" + # Find free variables in the python that are symbols. 
+ freevars = dict(zip(func.func_code.co_freevars, + (c.cell_contents for c in func.func_closure))) + sym_freevars = [] + for name in freevars: + val = freevars[name] + if isinstance(val, mx.sym.Symbol): + # We need to save the original symbol first. + sym_freevars.append(val) + gin_names.append(name) - return mx.sym._internal._foreach(g, input, *init_states) + if (isinstance(input, list)): + num_inputs = len(input) + else: + num_inputs = 1 + + # Here we need to find out how the input symbols are ordered as well as + # where the loop states are located in the list of inputs. + ins = init_states + sym_freevars + ins = {sym.name:sym for sym in ins} + ordered_ins = [] + in_state_locs = [-1] * len(init_states) + for in_name in g.list_inputs(): + assert in_name in gin_names, "The input graph contains variables we can't find" + if in_name in ins: + ordered_ins.append(ins[in_name]) + for i in range(len(init_states)): + if (init_states[i].name == in_name): + in_state_locs[i] = len(ordered_ins) - 1 + num_inputs + + return mx.sym._internal._foreach(g, input, *ordered_ins, num_outputs=len(flat_out), + in_state_locs=in_state_locs) diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index e5e9b522890b..fed685489f64 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -38,10 +38,11 @@ void RegisterLegacyOpProp(); void RegisterLegacyNDFunc(); } const std::vector kHiddenKeys = { - "ctx_group", "lr_mult", "wd_mult", "force_mirroring", "mirror_stage" + "ctx_group", "lr_mult", "wd_mult", "force_mirroring", "mirror_stage", "subgraph_name" }; const std::vector kReplacedHiddenKeys = { - "__ctx_group__", "__lr_mult__", "__wd_mult__", "__force_mirroring__", "__mirror_stage__" + "__ctx_group__", "__lr_mult__", "__wd_mult__", "__force_mirroring__", "__mirror_stage__", + "subgraph_name" }; const char *kNamespaceSeparator = "$"; diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 99a12a9fa901..135694742bed 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -161,11 +161,17 @@ static void ExecSubgraph(nnvm::Graph &g, const OpContext& ctx, struct ForeachParam : public dmlc::Parameter { int num_args; int dim; + int num_outputs; + nnvm::Tuple in_state_locs; DMLC_DECLARE_PARAMETER(ForeachParam) { DMLC_DECLARE_FIELD(num_args).set_lower_bound(1) .describe("Number of inputs."); DMLC_DECLARE_FIELD(dim).set_default(1) .describe("the dimension of the input array to iterate."); + DMLC_DECLARE_FIELD(num_outputs) + .describe("The number of outputs of the subgraph."); + DMLC_DECLARE_FIELD(in_state_locs) + .describe("The locations of loop states among the inputs."); } }; // struct ForeachParam @@ -196,6 +202,8 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + const ForeachParam& params = nnvm::get(attrs.parsed); + CHECK_EQ(outputs.size(), (size_t) params.num_outputs); CHECK_EQ(attrs.subgraphs.size(), 1U); nnvm::Graph &g = *attrs.subgraphs[0]; const auto& idx = g.indexed_graph(); @@ -238,8 +246,8 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, // The states from the previous iteration are used as the inputs of the next // iteration, so I have to maintain two arrays, so the inputs and outputs // of the subgraph share the same memory. 
- std::vector subg_outputs1(inputs.size()); - std::vector subg_outputs2(inputs.size()); + std::vector subg_outputs1(outputs.size()); + std::vector subg_outputs2(outputs.size()); std::vector *subg_outputs[2]{&subg_outputs1, &subg_outputs2}; // If the length is an odd number, the last iteration will use the first set // of outputs. In this way, we don't need to copy the results from the @@ -270,12 +278,13 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, // For the rest of the iterations, the rest of the arguments are the outputs // from the previous iteration. if (i > 0) { - for (size_t j = 1; j < subg_out_prev->size(); j++) - subg_inputs[j] = (*subg_out_prev)[j]; + for (size_t j = 1; j < subg_out_prev->size(); j++) { + CHECK_LT(params.in_state_locs[j - 1], subg_inputs.size()); + subg_inputs[params.in_state_locs[j - 1]] = (*subg_out_prev)[j]; + } } - std::vector reordered_ins = ReorderInputs(subg_inputs, idx); - ExecSubgraph(g, ctx, reordered_ins, req, *subg_out_curr); + ExecSubgraph(g, ctx, subg_inputs, req, *subg_out_curr); // We need to wait for the iteration to complete before executing // the next one or return from the loop. In this way, we can reuse // the memory in the subgraph. @@ -287,6 +296,8 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, static bool ForeachShape(const nnvm::NodeAttrs& attrs, std::vector *in_shape, std::vector *out_shape) { + const ForeachParam& params = nnvm::get(attrs.parsed); + CHECK_EQ(out_shape->size(), (size_t) params.num_outputs); nnvm::ShapeVector shape_inputs = *in_shape; // foreach iterates over the first input NDArray over the first dimension. shape_inputs[0] = TShape(in_shape->at(0).begin() + 1, in_shape->at(0).end()); @@ -298,7 +309,6 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(idx.outputs().size(), out_shape->size()); // TODO(zhengda) This can also be called in the execution engine. // We need to make it thread-safe. - shape_inputs = ReorderInputs(shape_inputs, idx); imperative::CheckAndInferShape(g.get(), std::move(shape_inputs), true); const auto& shapes = g->GetAttr("shape"); @@ -322,6 +332,8 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, static bool ForeachType(const nnvm::NodeAttrs& attrs, std::vector *in_type, std::vector *out_type) { + const ForeachParam& params = nnvm::get(attrs.parsed); + CHECK_EQ(out_type->size(), (size_t) params.num_outputs); nnvm::DTypeVector dtype_inputs = *in_type; CHECK_EQ(attrs.subgraphs.size(), 1U); auto g = attrs.subgraphs[0]; @@ -331,7 +343,6 @@ static bool ForeachType(const nnvm::NodeAttrs& attrs, CHECK_EQ(idx.outputs().size(), out_type->size()); // TODO(zhengda) This can also be called in the execution engine. // We need to make it thread-safe. 
- dtype_inputs = ReorderInputs(dtype_inputs, idx); imperative::CheckAndInferType(g.get(), std::move(dtype_inputs), true); const auto &dtypes = g->GetAttr("dtype"); for (size_t i = 0; i < g->outputs.size(); i++) @@ -344,6 +355,8 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, DispatchMode* dispatch_mode, std::vector *in_attrs, std::vector *out_attrs) { + const ForeachParam& params = nnvm::get(attrs.parsed); + CHECK_EQ(out_attrs->size(), (size_t) params.num_outputs); CHECK_EQ(attrs.subgraphs.size(), 1U); auto g = attrs.subgraphs[0]; CHECK(g); @@ -352,7 +365,6 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(idx.outputs().size(), out_attrs->size()); exec::DevMaskVector dev_masks(idx.num_nodes(), dev_mask); StorageTypeVector storage_type_inputs = *in_attrs; - storage_type_inputs = ReorderInputs(storage_type_inputs, idx); imperative::CheckAndInferStorageType(g.get(), std::move(dev_masks), std::move(storage_type_inputs), true); *dispatch_mode = DispatchMode::kFComputeEx; @@ -374,7 +386,7 @@ NNVM_REGISTER_OP(_foreach) }) .set_num_outputs([](const NodeAttrs& attrs) { const ForeachParam& params = nnvm::get(attrs.parsed); - return params.num_args - 1; + return params.num_outputs; }) .set_attr("FListInputNames", [](const NodeAttrs& attrs) { @@ -390,8 +402,8 @@ NNVM_REGISTER_OP(_foreach) .set_attr("key_var_num_args", "num_args") .add_argument("fn", "Symbol", "Input graph.") .add_argument("input", "NDArray-or-Symbol", "The input array where we iterate over.") -.add_argument("states", "NDArray-or-Symbol[]", "The list of initial states."); -//.add_arguments(ForeachParam::__FIELDS__()); +.add_argument("states", "NDArray-or-Symbol[]", "The list of initial states.") +.add_arguments(ForeachParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 301d1eabf82b..44f432b98ac2 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5969,19 +5969,6 @@ def step(in1, states): @with_seed() def test_foreach_lstm(): - # This tests foreach with accumulation sum. - def step(in1, states): - params = mx.rnn.RNNParams() - params._params['i2h_weight'] = states[2] - params._params['h2h_weight'] = states[3] - params._params['i2h_bias'] = states[4] - params._params['h2h_bias'] = states[5] - lstm = mx.rnn.LSTMCell(4, prefix='mylstm_', params=params) - prev_states = [states[0], states[1]] - next_h, [next_h, next_c] = lstm(in1, prev_states) - # TODO This is problematic. We can't count on the user to define two different symbols. - return (next_h, [next_h, next_c, states[2], states[3], states[4], states[5]]) - data = mx.sym.var("data") init_h = mx.sym.var("h") init_c = mx.sym.var("c") @@ -5990,6 +5977,18 @@ def step(in1, states): i2h_bias = mx.sym.var("i2h_bias") h2h_bias = mx.sym.var("h2h_bias") + # This tests foreach with accumulation sum. + def step(in1, states): + params = mx.rnn.RNNParams() + params._params['i2h_weight'] = i2h_weight + params._params['h2h_weight'] = h2h_weight + params._params['i2h_bias'] = i2h_bias + params._params['h2h_bias'] = h2h_bias + lstm = mx.rnn.LSTMCell(4, prefix='mylstm_', params=params) + next_h, [next_h, next_c] = lstm(in1, states) + # TODO This is problematic. We can't count on the user to define two different symbols. 
+ return (next_h, [next_h, next_c]) + data_arr = mx.nd.random.uniform(shape=(5, 2, 4)) h_arr = mx.nd.random.uniform(shape=(2, 4)) c_arr = mx.nd.random.uniform(shape=(2, 4)) @@ -5998,7 +5997,7 @@ def step(in1, states): i2h_barr = mx.nd.random.uniform(shape=(16)) h2h_barr = mx.nd.random.uniform(shape=(16)) - out = mx.contrib.cf.foreach(step, data, [init_h, init_c, i2h_weight, h2h_weight, i2h_bias, h2h_bias]) + out = mx.contrib.cf.foreach(step, data, [init_h, init_c]) e = out.bind(ctx=mx.cpu(), args={'data': data_arr, 'h': h_arr, 'c': c_arr, 'i2h_weight': i2h_warr, 'h2h_weight': h2h_warr, 'i2h_bias': i2h_barr, 'h2h_bias': h2h_barr}) e.forward() From 74d280beec514337ed0729df5a46bdfd5418f54e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 13 Apr 2018 18:43:14 +0000 Subject: [PATCH 018/135] Get all input symbols of a subgraph. --- include/mxnet/c_api.h | 10 ++++ python/mxnet/contrib/control_flow.py | 85 +++++++++++++++++----------- src/c_api/c_api_symbolic.cc | 27 +++++++++ 3 files changed, 89 insertions(+), 33 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 4dd858a51c4b..791a7e3aca29 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1051,6 +1051,16 @@ MXNET_DLL int MXSymbolListAtomicSymbolCreators(mx_uint *out_size, */ MXNET_DLL int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, const char **name); + +/*! + * \brief Get the input symbols of the graph. + * \param sym The graph. + * \param outs The input symbols of the graph. + * \param out_size the number of input symbols returned. + */ +MXNET_DLL int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **outs, + int *out_size); + /*! * \brief Get the detailed information about atomic symbol. * \param creator the AtomicSymbolCreator. diff --git a/python/mxnet/contrib/control_flow.py b/python/mxnet/contrib/control_flow.py index 2e3a8997b016..6175df124eb1 100644 --- a/python/mxnet/contrib/control_flow.py +++ b/python/mxnet/contrib/control_flow.py @@ -15,44 +15,54 @@ # specific language governing permissions and limitations # under the License. -import mxnet as mx +import ctypes + +from .. import symbol +from ..base import _LIB, c_str, c_array, check_call +from ..base import SymbolHandle, NDArrayHandle +from ..attribute import AttrScope + +def _get_graph_inputs(subg, name, prefix): + num_handles = ctypes.c_int(1000) + handles = c_array(SymbolHandle, [SymbolHandle(0) for i in range(1000)]) + check_call(_LIB.MXSymbolGetInputSymbols(subg.handle, handles, + ctypes.byref(num_handles))) + + syms = [] + for i in range(num_handles.value): + s = symbol.Symbol(handles[i]) + syms.append(s) + return syms def foreach(func, input, init_states, back_prop=False, name="foreach"): - in_ele = mx.sym.var("in") - gin_names = ["in"] - states = [] - i = 0 assert isinstance(init_states, list), "init_states should be a list" - for s in init_states: - states.append(mx.sym.var(s.name)) - gin_names.append(s.name) - i = i + 1 - with mx.AttrScope(subgraph_name=name): + states = [] + with AttrScope(subgraph_name=name): + in_ele = symbol.var("in") + for s in init_states: + states.append(symbol.var(s.name)) + sym_out = func(in_ele, states) # The function should return a tuple. The first element goes to # the output of the function. The second element is a list. 
assert isinstance(sym_out, tuple), "func should return a tuple (out, states)" assert isinstance(sym_out[1], list), \ "the second element in the returned tuple should be a list" + assert len(sym_out[1]) == len(init_states), \ + "the number of output states (%d) should be the same as input states (%d)" \ + % (len(sym_out[1]), len(init_states)) - flat_out = [sym_out[0]] + if (isinstance(sym_out[0], list)): + flat_out = sym_out[0] + else: + flat_out = [sym_out[0]] for s in sym_out[1]: # There is a problem if the outputs are the same as the inputs # or the first output. # TODO this is a temp fix. - flat_out.append(mx.sym.identity(s)) - g = mx.sym.Group(flat_out) - - # Find free variables in the python that are symbols. - freevars = dict(zip(func.func_code.co_freevars, - (c.cell_contents for c in func.func_closure))) - sym_freevars = [] - for name in freevars: - val = freevars[name] - if isinstance(val, mx.sym.Symbol): - # We need to save the original symbol first. - sym_freevars.append(val) - gin_names.append(name) + flat_out.append(symbol.identity(s)) + g = symbol.Group(flat_out) + input_syms = _get_graph_inputs(g, name, "ro_var") if (isinstance(input, list)): num_inputs = len(input) @@ -61,17 +71,26 @@ def foreach(func, input, init_states, back_prop=False, name="foreach"): # Here we need to find out how the input symbols are ordered as well as # where the loop states are located in the list of inputs. - ins = init_states + sym_freevars - ins = {sym.name:sym for sym in ins} + + # This dict contains the symbols of the subgraph. + input_syms = {sym.name:sym for sym in input_syms} + gin_names = input_syms.keys() + # This array contains the symbols for the inputs of foreach. ordered_ins = [] + states_map = {sym.name:sym for sym in init_states} + state_names = states_map.keys() in_state_locs = [-1] * len(init_states) for in_name in g.list_inputs(): - assert in_name in gin_names, "The input graph contains variables we can't find" - if in_name in ins: - ordered_ins.append(ins[in_name]) - for i in range(len(init_states)): - if (init_states[i].name == in_name): - in_state_locs[i] = len(ordered_ins) - 1 + num_inputs + assert in_name in gin_names, "The input variable %s can't be found in graph inputs: %s" \ + % (in_name, str(gin_names)) + if (in_name in state_names): + ordered_ins.append(states_map[in_name]) + elif (in_name != "in"): + ordered_ins.append(input_syms[in_name]) + + for i in range(len(init_states)): + if (init_states[i].name == in_name): + in_state_locs[i] = len(ordered_ins) - 1 + num_inputs - return mx.sym._internal._foreach(g, input, *ordered_ins, num_outputs=len(flat_out), + return symbol._internal._foreach(g, input, *ordered_ins, num_outputs=len(flat_out), in_state_locs=in_state_locs) diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index fed685489f64..18e03544caf1 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -345,6 +345,33 @@ int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, API_END(); } +int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **out_arr, int *out_size) { + API_BEGIN(); + nnvm::Symbol *s = static_cast(sym); + nnvm::Graph g; + g.outputs = s->outputs; + std::vector input_syms; + const nnvm::IndexedGraph& idx = g.indexed_graph(); + size_t max_out_size = *out_size; + // Go through all nodes and return the ones representing variables. 
+ for (size_t i = 0; i < idx.num_nodes(); i++) { + const nnvm::Node &n = *idx[i].source; + for (const nnvm::NodeEntry &e : n.inputs) { + auto p = e.node; + if (p->is_variable()) { + nnvm::Symbol *s = new nnvm::Symbol(); + s->outputs.push_back(e); + input_syms.push_back(s); + std::cout << p->attrs.name << std::endl; + } + } + } + CHECK(input_syms.size() <= max_out_size); + *out_size = input_syms.size(); + memcpy(out_arr, input_syms.data(), sizeof(*out_arr) * input_syms.size()); + API_END(); +} + int MXSymbolCreateFromFile(const char *fname, SymbolHandle *out) { nnvm::Symbol *s = new nnvm::Symbol(); API_BEGIN(); From 2bc80e38e54793be2bf4449722735b8270156971 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 13 Apr 2018 23:38:52 +0000 Subject: [PATCH 019/135] Fix shape, dtype and storage inference. --- src/operator/nn/control_flow.cc | 36 ++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 135694742bed..9ada9f105624 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -312,6 +312,16 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, imperative::CheckAndInferShape(g.get(), std::move(shape_inputs), true); const auto& shapes = g->GetAttr("shape"); + // Inferring the shape in the subgraph may infer the shape of the inputs. + // We need to copy the inferred input shapes back. + const auto &input_nids = idx.input_nodes(); + CHECK_EQ(input_nids.size(), in_shape->size()); + size_t num_input_arrays = 1; + for (size_t i = num_input_arrays; i < in_shape->size(); i++) { + auto eid = idx.entry_id(input_nids[i], 0); + (*in_shape)[i] = shapes[eid]; + } + // For the first shape. uint32_t eid = idx.entry_id(g->outputs[0]); const auto& g_out_shape = shapes[eid]; @@ -344,7 +354,19 @@ static bool ForeachType(const nnvm::NodeAttrs& attrs, // TODO(zhengda) This can also be called in the execution engine. // We need to make it thread-safe. imperative::CheckAndInferType(g.get(), std::move(dtype_inputs), true); + + size_t num_input_arrays = 1; const auto &dtypes = g->GetAttr("dtype"); + + // Inferring the data type in the subgraph may infer the data type of the inputs. + // We need to copy the inferred input data types back. + const auto &input_nids = idx.input_nodes(); + CHECK_EQ(input_nids.size(), in_type->size()); + for (size_t i = num_input_arrays; i < in_type->size(); i++) { + auto eid = idx.entry_id(input_nids[i], 0); + (*in_type)[i] = dtypes[eid]; + } + for (size_t i = 0; i < g->outputs.size(); i++) (*out_type)[i] = dtypes[idx.entry_id(g->outputs[i])]; return true; @@ -367,8 +389,20 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, StorageTypeVector storage_type_inputs = *in_attrs; imperative::CheckAndInferStorageType(g.get(), std::move(dev_masks), std::move(storage_type_inputs), true); - *dispatch_mode = DispatchMode::kFComputeEx; + + size_t num_input_arrays = 1; const auto& stypes = g->GetAttr("storage_type"); + + // Inferring the storage in the subgraph may infer the storage of the inputs. + // We need to copy the inferred input storage back. 
+ const auto &input_nids = idx.input_nodes(); + CHECK_EQ(input_nids.size(), in_attrs->size()); + for (size_t i = num_input_arrays; i < in_attrs->size(); i++) { + auto eid = idx.entry_id(input_nids[i], 0); + (*in_attrs)[i] = stypes[eid]; + } + + *dispatch_mode = DispatchMode::kFComputeEx; auto &outputs = idx.outputs(); CHECK(outputs.size() == out_attrs->size()); for (size_t i = 0; i < out_attrs->size(); i++) From 68faa17b188d0bfcbf160d5f2faf83c93c199bb2 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 14 Apr 2018 00:02:42 +0000 Subject: [PATCH 020/135] reorganize the output of foreach. --- python/mxnet/contrib/control_flow.py | 17 +++++++++++++++-- tests/python/unittest/test_operator.py | 14 +++++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/python/mxnet/contrib/control_flow.py b/python/mxnet/contrib/control_flow.py index 6175df124eb1..df3716040aa9 100644 --- a/python/mxnet/contrib/control_flow.py +++ b/python/mxnet/contrib/control_flow.py @@ -92,5 +92,18 @@ def foreach(func, input, init_states, back_prop=False, name="foreach"): if (init_states[i].name == in_name): in_state_locs[i] = len(ordered_ins) - 1 + num_inputs - return symbol._internal._foreach(g, input, *ordered_ins, num_outputs=len(flat_out), - in_state_locs=in_state_locs) + num_outputs = len(flat_out) + num_states = len(state_names) + ret = symbol._internal._foreach(g, input, *ordered_ins, num_outputs=num_outputs, + in_state_locs=in_state_locs) + if (num_outputs - num_states > 1): + outs = [] + for i in range(num_outputs - num_states): + outs.append(ret[i]) + else: + outs = ret[0] + states = [] + for i in range(num_states): + states.append(ret[num_outputs - num_states + i]) + + return (outs, states) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 44f432b98ac2..892418e48caf 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5945,12 +5945,11 @@ def test_foreach(): # This tests foreach with accumulation sum. def step(in1, states): out = in1 + states[0] - # TODO This is problematic. We can't count on the user to define two different symbols. - return (out, [out * 1]) + return (out, [out]) out = mx.contrib.cf.foreach(step, v3, [v4]) out1 = out[0] * 2 - out = mx.sym.Group([out1, out[1]]) + out = mx.sym.Group([out1, out[1][0]]) arr1 = mx.nd.random.uniform(shape=(5, 2)) arr2 = mx.nd.random.uniform(shape=(2)) e = out.bind(ctx=mx.cpu(), args={'v3': arr1, 'v4': arr2}) @@ -5989,6 +5988,14 @@ def step(in1, states): # TODO This is problematic. We can't count on the user to define two different symbols. return (next_h, [next_h, next_c]) + def sym_group(out): + if (isinstance(out[0], mx.sym.Symbol)): + ret = [out[0]] + else: + ret = out[0] + ret.extend(out[1]) + return mx.sym.Group(ret) + data_arr = mx.nd.random.uniform(shape=(5, 2, 4)) h_arr = mx.nd.random.uniform(shape=(2, 4)) c_arr = mx.nd.random.uniform(shape=(2, 4)) @@ -5998,6 +6005,7 @@ def step(in1, states): h2h_barr = mx.nd.random.uniform(shape=(16)) out = mx.contrib.cf.foreach(step, data, [init_h, init_c]) + out = sym_group(out) e = out.bind(ctx=mx.cpu(), args={'data': data_arr, 'h': h_arr, 'c': c_arr, 'i2h_weight': i2h_warr, 'h2h_weight': h2h_warr, 'i2h_bias': i2h_barr, 'h2h_bias': h2h_barr}) e.forward() From 3751ca7f16f17b60e137f5ece82c4f276a4f5760 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 14 Apr 2018 00:05:09 +0000 Subject: [PATCH 021/135] Add a gluon RNN unroll with symbol foreach. 
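
The contrib cell's unroll no longer takes the sequence length up front: when
the input is a single 3-D symbol it builds the loop with foreach instead of
slicing the input into a Python list of per-step symbols. A sketch of the
intended usage, mirroring the new test below (it assumes this branch is
built, since it goes through the new _foreach operator):

    import mxnet as mx
    from mxnet import gluon

    cell = gluon.contrib.rnn.RNNCell(100, prefix='rnn_')
    inputs = mx.sym.Variable('rnn_data')          # layout TNC
    outputs, states = cell.unroll(inputs)
    _, out_shapes, _ = outputs.infer_shape(rnn_data=(3, 10, 50))
    # out_shapes is expected to be [(3, 10, 100)]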
--- python/mxnet/gluon/contrib/rnn/rnn_cell.py | 149 ++++++++++++++++++++- tests/python/unittest/test_gluon_rnn.py | 14 +- 2 files changed, 161 insertions(+), 2 deletions(-) diff --git a/python/mxnet/gluon/contrib/rnn/rnn_cell.py b/python/mxnet/gluon/contrib/rnn/rnn_cell.py index 1b9afee14bf2..7cd2d0a4a98f 100644 --- a/python/mxnet/gluon/contrib/rnn/rnn_cell.py +++ b/python/mxnet/gluon/contrib/rnn/rnn_cell.py @@ -17,10 +17,16 @@ # coding: utf-8 """Definition of various recurrent neural network cells.""" -__all__ = ['VariationalDropoutCell', 'LSTMPCell'] +__all__ = ['VariationalDropoutCell', 'LSTMPCell', 'SymHybridRNNCell', 'RNNCell'] +import inspect + +from .... import symbol, ndarray +from ....base import _as_list +from ....contrib.control_flow import foreach from ...rnn import BidirectionalCell, SequentialRNNCell, ModifierCell, HybridRecurrentCell from ...rnn.rnn_cell import _format_sequence, _get_begin_state, _mask_sequence_variable_length +from ...rnn.rnn_cell import RNNCell as GluonRNNCell from ... import tensor_types class VariationalDropoutCell(ModifierCell): @@ -315,3 +321,144 @@ def hybrid_forward(self, F, inputs, states, i2h_weight, return next_r, [next_r, next_c] # pylint: enable= arguments-differ + +class SymHybridRNNCell(HybridRecurrentCell): + def __init__(self, prefix=None, params=None): + super(SymHybridRNNCell, self).__init__(prefix=prefix, params=params) + + def unroll(self, inputs, begin_state=None, layout='NTC', + merge_outputs=None, valid_length=None): + # if this is a list, we can have unroll in the parent class to handle it. + if (isinstance(inputs, list)): + return super(SymHybridRNNCell, self).unroll(self, len(inputs), inputs, begin_state, + layout, merge_outputs, valid_length) + elif (isinstance(inputs, ndarray.NDArray)): + axis = layout.find('T') + length = inputs.shape[axis] + return super(SymHybridRNNCell, self).unroll(self, length, inputs, begin_state, + layout, merge_outputs, valid_length) + + self.reset() + + batch_size = 0 + F = symbol + axis = layout.find('T') + begin_state = _get_begin_state(self, F, begin_state, inputs, batch_size) + + states = begin_state + outputs = [] + all_states = [] + def iter_func(input, states): + return self(input, states) + outputs, last_states = foreach(iter_func, inputs, begin_state) + #if valid_length is not None: + # states = [F.SequenceLast(ele_list, + # sequence_length=valid_length, + # use_sequence_length=True, + # axis=0) + # for ele_list in all_states] + # outputs = F.SequenceMask(outputs, sequence_length=valid_length, use_sequence_length=True, + # axis=axis) + #outputs, _, _, _ = _format_sequence(length, outputs, layout, merge_outputs) + + return outputs, last_states + +class RNNCell(SymHybridRNNCell): + r"""Elman RNN recurrent neural network cell. + + Each call computes the following function: + + .. math:: + + h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh}) + + where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is the hidden + state of the previous layer at time `t` or :math:`input_t` for the first layer. + If nonlinearity='relu', then `ReLU` is used instead of `tanh`. + + Parameters + ---------- + hidden_size : int + Number of units in output symbol + activation : str or Symbol, default 'tanh' + Type of activation function. + i2h_weight_initializer : str or Initializer + Initializer for the input weights matrix, used for the linear + transformation of the inputs. 
+ h2h_weight_initializer : str or Initializer + Initializer for the recurrent weights matrix, used for the linear + transformation of the recurrent state. + i2h_bias_initializer : str or Initializer + Initializer for the bias vector. + h2h_bias_initializer : str or Initializer + Initializer for the bias vector. + prefix : str, default 'rnn_' + Prefix for name of `Block`s + (and name of weight if params is `None`). + params : Parameter or None + Container for weight sharing between cells. + Created if `None`. + + + Inputs: + - **data**: input tensor with shape `(batch_size, input_size)`. + - **states**: a list of one initial recurrent state tensor with shape + `(batch_size, num_hidden)`. + + Outputs: + - **out**: output tensor with shape `(batch_size, num_hidden)`. + - **next_states**: a list of one output recurrent state tensor with the + same shape as `states`. + """ + def __init__(self, hidden_size, activation='tanh', + i2h_weight_initializer=None, h2h_weight_initializer=None, + i2h_bias_initializer='zeros', h2h_bias_initializer='zeros', + input_size=0, prefix=None, params=None): + super(RNNCell, self).__init__(prefix=prefix, params=params) + self._hidden_size = hidden_size + self._activation = activation + self._input_size = input_size + self.i2h_weight = self.params.get('i2h_weight', shape=(hidden_size, input_size), + init=i2h_weight_initializer, + allow_deferred_init=True) + self.h2h_weight = self.params.get('h2h_weight', shape=(hidden_size, hidden_size), + init=h2h_weight_initializer, + allow_deferred_init=True) + self.i2h_bias = self.params.get('i2h_bias', shape=(hidden_size,), + init=i2h_bias_initializer, + allow_deferred_init=True) + self.h2h_bias = self.params.get('h2h_bias', shape=(hidden_size,), + init=h2h_bias_initializer, + allow_deferred_init=True) + + def state_info(self, batch_size=0): + return [{'shape': (batch_size, self._hidden_size), '__layout__': 'NC'}] + + def _alias(self): + return 'rnn' + + def __repr__(self): + s = '{name}({mapping}' + if hasattr(self, '_activation'): + s += ', {_activation}' + s += ')' + shape = self.i2h_weight.shape + mapping = '{0} -> {1}'.format(shape[1] if shape[1] else None, shape[0]) + return s.format(name=self.__class__.__name__, + mapping=mapping, + **self.__dict__) + + def hybrid_forward(self, F, inputs, states, i2h_weight, + h2h_weight, i2h_bias, h2h_bias): + prefix = 't%d_'%self._counter + i2h = F.FullyConnected(data=inputs, weight=i2h_weight, bias=i2h_bias, + num_hidden=self._hidden_size, + name=prefix+'i2h') + h2h = F.FullyConnected(data=states[0], weight=h2h_weight, bias=h2h_bias, + num_hidden=self._hidden_size, + name=prefix+'h2h') + output = self._get_activation(F, i2h + h2h, self._activation, + name=prefix+'out') + + print("contrib.RNNCell") + return output, [output] diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index 169a9d47e7cf..d2f6a3616c4b 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -28,13 +28,25 @@ def test_rnn(): inputs = [mx.sym.Variable('rnn_t%d_data'%i) for i in range(3)] outputs, _ = cell.unroll(3, inputs) outputs = mx.sym.Group(outputs) - assert sorted(cell.collect_params().keys()) == ['rnn_h2h_bias', 'rnn_h2h_weight', 'rnn_i2h_bias', 'rnn_i2h_weight'] + assert sorted(cell.collect_params().keys()) == ['rnn_h2h_bias', 'rnn_h2h_weight', + 'rnn_i2h_bias', 'rnn_i2h_weight'] assert outputs.list_outputs() == ['rnn_t0_out_output', 'rnn_t1_out_output', 'rnn_t2_out_output'] args, outs, auxs = 
outputs.infer_shape(rnn_t0_data=(10,50), rnn_t1_data=(10,50), rnn_t2_data=(10,50)) assert outs == [(10, 100), (10, 100), (10, 100)] +def test_contrib_rnn(): + contrib_cell = gluon.contrib.rnn.RNNCell(100, prefix='rnn_') + inputs = mx.sym.Variable('rnn_data') + contrib_outputs, _ = contrib_cell.unroll(inputs) + assert sorted(contrib_cell.collect_params().keys()) == ['rnn_h2h_bias', 'rnn_h2h_weight', + 'rnn_i2h_bias', 'rnn_i2h_weight'] + + args, outs, auxs = contrib_outputs.infer_shape(rnn_data=(3, 10,50)) + assert outs == [(3, 10, 100)] + + def test_lstm(): cell = gluon.rnn.LSTMCell(100, prefix='rnn_') inputs = [mx.sym.Variable('rnn_t%d_data'%i) for i in range(3)] From b98e06d22ec43149ee820eddf92f25753a51f5bd Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 16 Apr 2018 21:13:47 +0000 Subject: [PATCH 022/135] print unnecessary print. --- python/mxnet/gluon/contrib/rnn/rnn_cell.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/mxnet/gluon/contrib/rnn/rnn_cell.py b/python/mxnet/gluon/contrib/rnn/rnn_cell.py index 7cd2d0a4a98f..27871a56e315 100644 --- a/python/mxnet/gluon/contrib/rnn/rnn_cell.py +++ b/python/mxnet/gluon/contrib/rnn/rnn_cell.py @@ -460,5 +460,4 @@ def hybrid_forward(self, F, inputs, states, i2h_weight, output = self._get_activation(F, i2h + h2h, self._activation, name=prefix+'out') - print("contrib.RNNCell") return output, [output] From fc575fe143bb0677d8dad1e7a8dde1d91944da78 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 16 Apr 2018 23:26:26 +0000 Subject: [PATCH 023/135] have imperative and symbolic foreach. --- python/mxnet/contrib/control_flow.py | 109 --------------------- python/mxnet/gluon/contrib/rnn/rnn_cell.py | 18 ++-- python/mxnet/ndarray/contrib.py | 15 +++ python/mxnet/symbol/contrib.py | 93 ++++++++++++++++++ tests/python/unittest/test_gluon_rnn.py | 18 ++++ tests/python/unittest/test_operator.py | 4 +- 6 files changed, 136 insertions(+), 121 deletions(-) delete mode 100644 python/mxnet/contrib/control_flow.py diff --git a/python/mxnet/contrib/control_flow.py b/python/mxnet/contrib/control_flow.py deleted file mode 100644 index df3716040aa9..000000000000 --- a/python/mxnet/contrib/control_flow.py +++ /dev/null @@ -1,109 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import ctypes - -from .. 
import symbol -from ..base import _LIB, c_str, c_array, check_call -from ..base import SymbolHandle, NDArrayHandle -from ..attribute import AttrScope - -def _get_graph_inputs(subg, name, prefix): - num_handles = ctypes.c_int(1000) - handles = c_array(SymbolHandle, [SymbolHandle(0) for i in range(1000)]) - check_call(_LIB.MXSymbolGetInputSymbols(subg.handle, handles, - ctypes.byref(num_handles))) - - syms = [] - for i in range(num_handles.value): - s = symbol.Symbol(handles[i]) - syms.append(s) - return syms - -def foreach(func, input, init_states, back_prop=False, name="foreach"): - assert isinstance(init_states, list), "init_states should be a list" - states = [] - with AttrScope(subgraph_name=name): - in_ele = symbol.var("in") - for s in init_states: - states.append(symbol.var(s.name)) - - sym_out = func(in_ele, states) - # The function should return a tuple. The first element goes to - # the output of the function. The second element is a list. - assert isinstance(sym_out, tuple), "func should return a tuple (out, states)" - assert isinstance(sym_out[1], list), \ - "the second element in the returned tuple should be a list" - assert len(sym_out[1]) == len(init_states), \ - "the number of output states (%d) should be the same as input states (%d)" \ - % (len(sym_out[1]), len(init_states)) - - if (isinstance(sym_out[0], list)): - flat_out = sym_out[0] - else: - flat_out = [sym_out[0]] - for s in sym_out[1]: - # There is a problem if the outputs are the same as the inputs - # or the first output. - # TODO this is a temp fix. - flat_out.append(symbol.identity(s)) - g = symbol.Group(flat_out) - input_syms = _get_graph_inputs(g, name, "ro_var") - - if (isinstance(input, list)): - num_inputs = len(input) - else: - num_inputs = 1 - - # Here we need to find out how the input symbols are ordered as well as - # where the loop states are located in the list of inputs. - - # This dict contains the symbols of the subgraph. - input_syms = {sym.name:sym for sym in input_syms} - gin_names = input_syms.keys() - # This array contains the symbols for the inputs of foreach. - ordered_ins = [] - states_map = {sym.name:sym for sym in init_states} - state_names = states_map.keys() - in_state_locs = [-1] * len(init_states) - for in_name in g.list_inputs(): - assert in_name in gin_names, "The input variable %s can't be found in graph inputs: %s" \ - % (in_name, str(gin_names)) - if (in_name in state_names): - ordered_ins.append(states_map[in_name]) - elif (in_name != "in"): - ordered_ins.append(input_syms[in_name]) - - for i in range(len(init_states)): - if (init_states[i].name == in_name): - in_state_locs[i] = len(ordered_ins) - 1 + num_inputs - - num_outputs = len(flat_out) - num_states = len(state_names) - ret = symbol._internal._foreach(g, input, *ordered_ins, num_outputs=num_outputs, - in_state_locs=in_state_locs) - if (num_outputs - num_states > 1): - outs = [] - for i in range(num_outputs - num_states): - outs.append(ret[i]) - else: - outs = ret[0] - states = [] - for i in range(num_states): - states.append(ret[num_outputs - num_states + i]) - - return (outs, states) diff --git a/python/mxnet/gluon/contrib/rnn/rnn_cell.py b/python/mxnet/gluon/contrib/rnn/rnn_cell.py index 27871a56e315..98b89d22d5c3 100644 --- a/python/mxnet/gluon/contrib/rnn/rnn_cell.py +++ b/python/mxnet/gluon/contrib/rnn/rnn_cell.py @@ -23,7 +23,6 @@ from .... 
import symbol, ndarray from ....base import _as_list -from ....contrib.control_flow import foreach from ...rnn import BidirectionalCell, SequentialRNNCell, ModifierCell, HybridRecurrentCell from ...rnn.rnn_cell import _format_sequence, _get_begin_state, _mask_sequence_variable_length from ...rnn.rnn_cell import RNNCell as GluonRNNCell @@ -332,17 +331,16 @@ def unroll(self, inputs, begin_state=None, layout='NTC', if (isinstance(inputs, list)): return super(SymHybridRNNCell, self).unroll(self, len(inputs), inputs, begin_state, layout, merge_outputs, valid_length) - elif (isinstance(inputs, ndarray.NDArray)): - axis = layout.find('T') - length = inputs.shape[axis] - return super(SymHybridRNNCell, self).unroll(self, length, inputs, begin_state, - layout, merge_outputs, valid_length) self.reset() - - batch_size = 0 - F = symbol + batch_axis = layout.find('N') axis = layout.find('T') + batch_size = 0 + if isinstance(inputs, symbol.Symbol): + F = symbol + else: + batch_size = inputs.shape[batch_axis] + F = ndarray begin_state = _get_begin_state(self, F, begin_state, inputs, batch_size) states = begin_state @@ -350,7 +348,7 @@ def unroll(self, inputs, begin_state=None, layout='NTC', all_states = [] def iter_func(input, states): return self(input, states) - outputs, last_states = foreach(iter_func, inputs, begin_state) + outputs, last_states = F.contrib.foreach(iter_func, inputs, begin_state) #if valid_length is not None: # states = [F.SequenceLast(ele_list, # sequence_length=valid_length, diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index cc66483f00b3..a1ec2b0f2552 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -95,3 +95,18 @@ def rand_zipfian(true_classes, num_sampled, range_max, ctx=None): expected_count_sampled = expected_prob_sampled * num_sampled return sampled_classes, expected_count_true, expected_count_sampled # pylint: enable=line-too-long + +def foreach(func, input, init_states, back_prop=False, name="foreach"): + assert isinstance(init_states, list), "init_states should be a list" + states = init_states + outputs = [] + for i in range(input.shape[0]): + ele = input[i] + outs, states = func(ele, states) + outs = _as_list(outs) + if (i == 0): + outputs = outs + else: + for j in range(outs): + outputs[j].append(outs[j]) + return (outputs, states) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 83e90e687327..856591ca0c09 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -26,6 +26,13 @@ except ImportError: pass +import ctypes + +from . 
import symbol +from ..base import _LIB, c_str, c_array, check_call +from ..base import SymbolHandle, NDArrayHandle +from ..attribute import AttrScope + __all__ = ["rand_zipfian"] def rand_zipfian(true_classes, num_sampled, range_max): @@ -91,3 +98,89 @@ def rand_zipfian(true_classes, num_sampled, range_max): expected_prob_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 1.0)).log() / log_range expected_count_sampled = expected_prob_sampled * num_sampled return sampled_classes, expected_count_true, expected_count_sampled + +def _get_graph_inputs(subg, name, prefix): + num_handles = ctypes.c_int(1000) + handles = c_array(SymbolHandle, [SymbolHandle(0) for i in range(1000)]) + check_call(_LIB.MXSymbolGetInputSymbols(subg.handle, handles, + ctypes.byref(num_handles))) + + syms = [] + for i in range(num_handles.value): + s = Symbol(handles[i]) + syms.append(s) + return syms + +def foreach(func, input, init_states, back_prop=False, name="foreach"): + assert isinstance(init_states, list), "init_states should be a list" + states = [] + with AttrScope(subgraph_name=name): + in_ele = symbol.var("in") + for s in init_states: + states.append(symbol.var(s.name)) + + sym_out = func(in_ele, states) + # The function should return a tuple. The first element goes to + # the output of the function. The second element is a list. + assert isinstance(sym_out, tuple), "func should return a tuple (out, states)" + assert isinstance(sym_out[1], list), \ + "the second element in the returned tuple should be a list" + assert len(sym_out[1]) == len(init_states), \ + "the number of output states (%d) should be the same as input states (%d)" \ + % (len(sym_out[1]), len(init_states)) + + if (isinstance(sym_out[0], list)): + flat_out = sym_out[0] + else: + flat_out = [sym_out[0]] + for s in sym_out[1]: + # There is a problem if the outputs are the same as the inputs + # or the first output. + # TODO this is a temp fix. + flat_out.append(symbol.op.identity(s)) + g = symbol.Group(flat_out) + input_syms = _get_graph_inputs(g, name, "ro_var") + + if (isinstance(input, list)): + num_inputs = len(input) + else: + num_inputs = 1 + + # Here we need to find out how the input symbols are ordered as well as + # where the loop states are located in the list of inputs. + + # This dict contains the symbols of the subgraph. + input_syms = {sym.name:sym for sym in input_syms} + gin_names = input_syms.keys() + # This array contains the symbols for the inputs of foreach. 
+ ordered_ins = [] + states_map = {sym.name:sym for sym in init_states} + state_names = states_map.keys() + in_state_locs = [-1] * len(init_states) + for in_name in g.list_inputs(): + assert in_name in gin_names, "The input variable %s can't be found in graph inputs: %s" \ + % (in_name, str(gin_names)) + if (in_name in state_names): + ordered_ins.append(states_map[in_name]) + elif (in_name != "in"): + ordered_ins.append(input_syms[in_name]) + + for i in range(len(init_states)): + if (init_states[i].name == in_name): + in_state_locs[i] = len(ordered_ins) - 1 + num_inputs + + num_outputs = len(flat_out) + num_states = len(state_names) + ret = symbol._internal._foreach(g, input, *ordered_ins, num_outputs=num_outputs, + in_state_locs=in_state_locs) + if (num_outputs - num_states > 1): + outs = [] + for i in range(num_outputs - num_states): + outs.append(ret[i]) + else: + outs = ret[0] + states = [] + for i in range(num_states): + states.append(ret[num_outputs - num_states + i]) + + return (outs, states) diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index d2f6a3616c4b..f291733c5e5c 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -36,6 +36,14 @@ def test_rnn(): assert outs == [(10, 100), (10, 100), (10, 100)] +class RNNLayer(gluon.HybridBlock): + def __init__(self, prefix=None, params=None): + super(RNNLayer, self).__init__(prefix=prefix, params=params) + self.cell = gluon.contrib.rnn.RNNCell(100, prefix='rnn_') + + def hybrid_forward(self, F, inputs, states=None): + return self.cell.unroll(inputs, states) + def test_contrib_rnn(): contrib_cell = gluon.contrib.rnn.RNNCell(100, prefix='rnn_') inputs = mx.sym.Variable('rnn_data') @@ -46,6 +54,16 @@ def test_contrib_rnn(): args, outs, auxs = contrib_outputs.infer_shape(rnn_data=(3, 10,50)) assert outs == [(3, 10, 100)] + rnn_data = mx.nd.normal(loc=0, scale=1, shape=(3, 10, 50)) + layer = RNNLayer() + layer.initialize(ctx=mx.cpu(0)) + res1 = layer(rnn_data) + + layer = RNNLayer() + layer.initialize(ctx=mx.cpu(0)) + layer.hybridize() + res2 = layer(rnn_data) + def test_lstm(): cell = gluon.rnn.LSTMCell(100, prefix='rnn_') diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 892418e48caf..0389744003f4 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5947,7 +5947,7 @@ def step(in1, states): out = in1 + states[0] return (out, [out]) - out = mx.contrib.cf.foreach(step, v3, [v4]) + out = mx.sym.contrib.foreach(step, v3, [v4]) out1 = out[0] * 2 out = mx.sym.Group([out1, out[1][0]]) arr1 = mx.nd.random.uniform(shape=(5, 2)) @@ -6004,7 +6004,7 @@ def sym_group(out): i2h_barr = mx.nd.random.uniform(shape=(16)) h2h_barr = mx.nd.random.uniform(shape=(16)) - out = mx.contrib.cf.foreach(step, data, [init_h, init_c]) + out = mx.sym.contrib.foreach(step, data, [init_h, init_c]) out = sym_group(out) e = out.bind(ctx=mx.cpu(), args={'data': data_arr, 'h': h_arr, 'c': c_arr, 'i2h_weight': i2h_warr, 'h2h_weight': h2h_warr, 'i2h_bias': i2h_barr, 'h2h_bias': h2h_barr}) From 37da6fbd37ca6f965b523570270c32ee83f524b0 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 18 Apr 2018 01:57:42 +0000 Subject: [PATCH 024/135] Fix an error after moving foreach. 
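
With foreach now living in symbol.contrib and ndarray.contrib, the old
contrib.control_flow module (and the mx.contrib.cf alias removed below) is
gone. A small sketch of the call convention through the new entry point,
mirroring the updated test_foreach:

    import mxnet as mx

    def step(data, states):
        out = data + states[0]
        return out, [out]

    data = mx.sym.var('data')
    init = mx.sym.var('init')
    outs, final_states = mx.sym.contrib.foreach(step, data, [init])
    # outs is the stacked per-step output; final_states holds the last states.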
--- python/mxnet/contrib/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/mxnet/contrib/__init__.py b/python/mxnet/contrib/__init__.py index 7489d97d90fe..fbfd3469678b 100644 --- a/python/mxnet/contrib/__init__.py +++ b/python/mxnet/contrib/__init__.py @@ -32,4 +32,3 @@ from . import io from . import quantization from . import quantization as quant -from . import control_flow as cf From f41235c24a0ba40a6c82346ab26e7031c9a36fe1 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 18 Apr 2018 01:58:28 +0000 Subject: [PATCH 025/135] Fix imperative foreach --- python/mxnet/ndarray/contrib.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index a1ec2b0f2552..0697f5ac2bbd 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -21,6 +21,8 @@ import math from ..context import current_context from ..random import uniform +from ..base import _as_list +from .op import stack try: from .gen_contrib import * except ImportError: @@ -105,8 +107,12 @@ def foreach(func, input, init_states, back_prop=False, name="foreach"): outs, states = func(ele, states) outs = _as_list(outs) if (i == 0): - outputs = outs + # outputs is a list of lists + for j in range(len(outs)): + outputs.append([outs[j]]) else: - for j in range(outs): + for j in range(len(outs)): outputs[j].append(outs[j]) + for i in range(len(outputs)): + outputs[i] = stack(*outputs[i]) return (outputs, states) From 214c1c2fab72b5cda7f01bc9d8ff890af12811ae Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 24 Apr 2018 02:10:42 +0000 Subject: [PATCH 026/135] Fix a minor problem. --- python/mxnet/gluon/contrib/rnn/rnn_cell.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/gluon/contrib/rnn/rnn_cell.py b/python/mxnet/gluon/contrib/rnn/rnn_cell.py index 98b89d22d5c3..dcb396a57613 100644 --- a/python/mxnet/gluon/contrib/rnn/rnn_cell.py +++ b/python/mxnet/gluon/contrib/rnn/rnn_cell.py @@ -325,7 +325,7 @@ class SymHybridRNNCell(HybridRecurrentCell): def __init__(self, prefix=None, params=None): super(SymHybridRNNCell, self).__init__(prefix=prefix, params=params) - def unroll(self, inputs, begin_state=None, layout='NTC', + def unroll(self, inputs, begin_state=None, layout='TNC', merge_outputs=None, valid_length=None): # if this is a list, we can have unroll in the parent class to handle it. if (isinstance(inputs, list)): From 9aabc7478aa0c589e7729ada798c2bb491041838 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 30 Apr 2018 18:56:46 +0000 Subject: [PATCH 027/135] Use CachedOp to execute subgraph. 
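
Instead of planning memory and running the subgraph node by node inside the
operator, ExecSubgraph now hands the subgraph Symbol to Imperative::CachedOp
and calls Forward on it for every iteration. Roughly the same mechanism seen
from the Python side (ndarray.CachedOp is the class Gluon uses when
hybridizing; the exact constructor arguments are an assumption of this
sketch, not part of the patch):

    import mxnet as mx
    from mxnet import ndarray as nd

    x = mx.sym.var('x')
    y = mx.sym.var('y')
    op = nd.CachedOp(mx.sym.elemwise_add(x, y))   # build once from a symbol

    a = mx.nd.ones((2, 4))
    b = mx.nd.full((2, 4), 2.0)
    out = op(a, b)                                # execute it on NDArrays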
--- src/operator/nn/control_flow.cc | 174 ++++---------------------------- 1 file changed, 18 insertions(+), 156 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 9ada9f105624..fd83d3b07363 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -30,132 +30,21 @@ namespace mxnet { namespace op { -void RunGraph(const nnvm::IndexedGraph& idx, - const std::vector arrays, - size_t node_start, size_t node_end, - std::vector&& array_reqs, - std::vector&& ref_count, - std::vector *p_states, - const DispatchModeVector &dispatch_modes) { - using namespace nnvm; - using namespace imperative; - static auto& createop = nnvm::Op::GetAttr("FCreateOpState"); - static auto& is_layer_backward = Op::GetAttr("TIsLayerOpBackward"); - - std::vector& states = *p_states; - std::vector ndinputs, ndoutputs; - ShapeVector arg_shapes; - DTypeVector arg_dtypes; - std::vector req; - - for (size_t i = node_start; i < node_end; ++i) { - const nnvm::IndexedGraph::Node& node = idx[i]; - if (node.source->op() == nullptr) continue; - auto num_outputs = node.source->num_outputs(); - ndinputs.clear(); - ndinputs.reserve(node.inputs.size()); - for (const auto& j : node.inputs) { - ndinputs.emplace_back(arrays[idx.entry_id(j)]); - CHECK(!ndinputs.back()->is_none()) << idx[j.node_id].source->attrs.name - << " " << j.index; - } - ndoutputs.clear(); - ndoutputs.reserve(num_outputs); - req.clear(); - req.reserve(num_outputs); - for (size_t j = 0; j < num_outputs; ++j) { - size_t eid = idx.entry_id(i, j); - ndoutputs.emplace_back(arrays[eid]); - req.push_back(array_reqs[eid]); - CHECK(!ndoutputs.back()->is_none()); - } - const Context& ctx = ndoutputs[0]->ctx(); - const DispatchMode dispatch_mode = dispatch_modes[i]; - if (createop.count(node.source->op())) { - arg_shapes.clear(); - arg_dtypes.clear(); - arg_shapes.reserve(ndinputs.size()); - arg_dtypes.reserve(ndinputs.size()); - for (size_t i = 0; i < ndinputs.size(); ++i) { - arg_shapes.emplace_back(ndinputs[i]->shape()); - arg_dtypes.emplace_back(ndinputs[i]->dtype()); - } - states[i] = createop[node.source->op()]( - node.source->attrs, ctx, arg_shapes, arg_dtypes); - Imperative::InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, req, - dispatch_mode, states[i]); - } else if (is_layer_backward.get(node.source->op(), false)) { - nnvm::Node* fwd_node = node.source->control_deps[0].get(); - auto fwd_node_id = idx.node_id(fwd_node); - Imperative::InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, - req, dispatch_mode, states[fwd_node_id]); - } else { - Imperative::InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, - req, dispatch_mode); - } - } -} - -static void ExecSubgraph(nnvm::Graph &g, const OpContext& ctx, - const std::vector& cinputs, +static void ExecSubgraph(nnvm::Symbol &sym, const OpContext& ctx, + std::vector cinputs, const std::vector& req, - const std::vector& coutputs) { + std::vector coutputs) { using namespace nnvm; using namespace imperative; - const auto& idx = g.indexed_graph(); - size_t num_inputs = idx.input_nodes().size(); - - CHECK_EQ(num_inputs, cinputs.size()) - << "The subgraph requires " << num_inputs << " but got " << cinputs.size(); - Context default_ctx = cinputs[0].ctx(); - for (size_t i = 0; i < cinputs.size(); ++i) { - CHECK_EQ(cinputs[i].ctx(), default_ctx) - << "The subgraph requires all inputs to live on the same context. 
But " - << idx[idx.input_nodes()[0]].source->attrs.name << " is on " << default_ctx - << " while " << idx[idx.input_nodes()[i]].source->attrs.name << " is on " - << cinputs[i].ctx(); - } - - // TODO(zhengda) we might want to buffer them. - std::vector buff; - std::vector states; - std::vector inputs = cinputs; - std::vector outputs = coutputs; - - // Allocate entries - states.resize(idx.num_nodes()); - buff.resize(idx.num_node_entries()); - states.reserve(idx.num_nodes()); - std::vector arrays; - arrays.reserve(buff.size()); - for (size_t i = 0; i < buff.size(); ++i) arrays.push_back(&buff[i]); - for (size_t i = 0; i < num_inputs; ++i) { - arrays[idx.entry_id(idx.input_nodes()[i], 0)] = &inputs[i]; - } - for (size_t i = 0; i < idx.outputs().size(); ++i) { - auto eid = idx.entry_id(idx.outputs()[i]); - if (!arrays[eid]->is_none()) outputs[i] = arrays[eid]->Detach(); - arrays[eid] = &outputs[i]; - } - - // Allocate memory for the NDArrays - std::vector ref_count = g.GetAttr >( - ctx.is_train ? "full_ref_count" : "forward_ref_count"); - - std::vector array_reqs(arrays.size(), kWriteTo); - for (size_t i = 0; i < idx.num_node_entries(); ++i) { - if (ref_count[i] == 0) array_reqs[i] = kNullOp; - } - - const auto& mem_plan = g.GetAttr( - ctx.is_train ? "full_mem_plan" : "forward_mem_plan"); - AllocateMemory(g, idx, default_ctx, 0, idx.num_node_entries(), - mem_plan, arrays, &array_reqs); - - const auto& dispatch_modes = g.GetAttr("dispatch_mode"); - RunGraph(idx, arrays, 0, idx.num_nodes(), std::move(array_reqs), - std::move(ref_count), &states, dispatch_modes); + std::vector inputs(cinputs.size()); + std::vector outputs(coutputs.size()); + for (size_t i = 0; i < inputs.size(); i++) + inputs[i] = &cinputs[i]; + for (size_t i = 0; i < outputs.size(); i++) + outputs[i] = &coutputs[i]; + Imperative::CachedOp op(sym, std::vector >()); + op.Forward(nullptr, inputs, outputs); } struct ForeachParam : public dmlc::Parameter { @@ -205,33 +94,6 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, const ForeachParam& params = nnvm::get(attrs.parsed); CHECK_EQ(outputs.size(), (size_t) params.num_outputs); CHECK_EQ(attrs.subgraphs.size(), 1U); - nnvm::Graph &g = *attrs.subgraphs[0]; - const auto& idx = g.indexed_graph(); - - // If this is inference, we only need the forward memory plan. - bool has_mem_plan = !ctx.is_train && g.attrs.count("forward_mem_plan"); - // If this is training, we need the full memory plan. - has_mem_plan = has_mem_plan || (ctx.is_train && g.attrs.count("full_mem_plan")); - // If we don't have a memory plan yet, we need to create a memory plan. - if (!has_mem_plan) { - nnvm::StorageVector storage(idx.num_node_entries(), exec::kBadStorageID); - for (const auto i : idx.input_nodes()) - storage[idx.entry_id(i, 0)] = exec::kExternalStorageID; - const auto& stypes = g.GetAttr("storage_type"); - CHECK_EQ(stypes.size(), storage.size()); - for (size_t i = 0; i < stypes.size(); i++) { - if (stypes[i] != kDefaultStorage) - storage[i] = exec::kDynamicStorageID; - } - - auto mem_plan = imperative::PlanMemory( - &g, std::move(storage), g.GetAttr >( - ctx.is_train ? "full_ref_count" : "forward_ref_count")); - // TODO(zhengda) we need to be careful of changing graph attributes. - // It's not thread-safe. - g.attrs[ctx.is_train ? 
"full_mem_plan" : "forward_mem_plan"] - = std::make_shared(std::move(mem_plan)); - } size_t len = inputs[0].shape()[0]; CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); @@ -284,7 +146,7 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, } } - ExecSubgraph(g, ctx, subg_inputs, req, *subg_out_curr); + ExecSubgraph(*attrs.subgraphs[0], ctx, subg_inputs, req, *subg_out_curr); // We need to wait for the iteration to complete before executing // the next one or return from the loop. In this way, we can reuse // the memory in the subgraph. @@ -302,8 +164,8 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, // foreach iterates over the first input NDArray over the first dimension. shape_inputs[0] = TShape(in_shape->at(0).begin() + 1, in_shape->at(0).end()); CHECK_EQ(attrs.subgraphs.size(), 1U); - auto g = attrs.subgraphs[0]; - CHECK(g); + auto g = std::make_shared(); + g->outputs = attrs.subgraphs[0]->outputs; const auto& idx = g->indexed_graph(); CHECK_EQ(idx.input_nodes().size(), in_shape->size()); CHECK_EQ(idx.outputs().size(), out_shape->size()); @@ -346,8 +208,8 @@ static bool ForeachType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_type->size(), (size_t) params.num_outputs); nnvm::DTypeVector dtype_inputs = *in_type; CHECK_EQ(attrs.subgraphs.size(), 1U); - auto g = attrs.subgraphs[0]; - CHECK(g); + auto g = std::make_shared(); + g->outputs = attrs.subgraphs[0]->outputs; const auto& idx = g->indexed_graph(); CHECK_EQ(idx.input_nodes().size(), in_type->size()); CHECK_EQ(idx.outputs().size(), out_type->size()); @@ -380,8 +242,8 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, const ForeachParam& params = nnvm::get(attrs.parsed); CHECK_EQ(out_attrs->size(), (size_t) params.num_outputs); CHECK_EQ(attrs.subgraphs.size(), 1U); - auto g = attrs.subgraphs[0]; - CHECK(g); + auto g = std::make_shared(); + g->outputs = attrs.subgraphs[0]->outputs; const auto& idx = g->indexed_graph(); CHECK_EQ(idx.input_nodes().size(), in_attrs->size()); CHECK_EQ(idx.outputs().size(), out_attrs->size()); From 7fc01559f69e95607145c7500090d3a558b773a1 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 1 May 2018 18:21:40 +0000 Subject: [PATCH 028/135] update TODO. --- src/operator/nn/control_flow.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index fd83d3b07363..4f76ccae139d 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -166,11 +166,10 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(attrs.subgraphs.size(), 1U); auto g = std::make_shared(); g->outputs = attrs.subgraphs[0]->outputs; + // TODO(zhengda) We should avoid creating an index graph so many times. const auto& idx = g->indexed_graph(); CHECK_EQ(idx.input_nodes().size(), in_shape->size()); CHECK_EQ(idx.outputs().size(), out_shape->size()); - // TODO(zhengda) This can also be called in the execution engine. - // We need to make it thread-safe. imperative::CheckAndInferShape(g.get(), std::move(shape_inputs), true); const auto& shapes = g->GetAttr("shape"); @@ -210,11 +209,10 @@ static bool ForeachType(const nnvm::NodeAttrs& attrs, CHECK_EQ(attrs.subgraphs.size(), 1U); auto g = std::make_shared(); g->outputs = attrs.subgraphs[0]->outputs; + // TODO(zhengda) We should avoid creating an index graph so many times. 
const auto& idx = g->indexed_graph(); CHECK_EQ(idx.input_nodes().size(), in_type->size()); CHECK_EQ(idx.outputs().size(), out_type->size()); - // TODO(zhengda) This can also be called in the execution engine. - // We need to make it thread-safe. imperative::CheckAndInferType(g.get(), std::move(dtype_inputs), true); size_t num_input_arrays = 1; @@ -244,6 +242,7 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(attrs.subgraphs.size(), 1U); auto g = std::make_shared(); g->outputs = attrs.subgraphs[0]->outputs; + // TODO(zhengda) We should avoid creating an index graph so many times. const auto& idx = g->indexed_graph(); CHECK_EQ(idx.input_nodes().size(), in_attrs->size()); CHECK_EQ(idx.outputs().size(), out_attrs->size()); From 0d3613a7b2f2c8bd6876e94c9f154838f57e07a9 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 1 May 2018 23:33:59 +0000 Subject: [PATCH 029/135] make foreach op use FStatefulComputeEx. TODO we need to change stateful executor to handle subgraph. --- src/operator/nn/control_flow.cc | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 4f76ccae139d..965dfa1b6560 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -86,14 +86,24 @@ static std::vector ReorderInputs(const std::vector &in, const nnvm::Indexe return ret; } -static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, +struct ForeachState { + Symbol subgraph; + ForeachParam params; + + ForeachState(const Symbol &g, const ForeachParam ¶ms) { + this->subgraph = g; + this->params = params; + } +}; + +static void ForeachComputeExCPU(const OpStatePtr& state_ptr, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - const ForeachParam& params = nnvm::get(attrs.parsed); + ForeachState state = state_ptr.get_state(); + const ForeachParam& params = state.params; CHECK_EQ(outputs.size(), (size_t) params.num_outputs); - CHECK_EQ(attrs.subgraphs.size(), 1U); size_t len = inputs[0].shape()[0]; CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); @@ -146,7 +156,7 @@ static void ForeachComputeExCPU(const nnvm::NodeAttrs& attrs, } } - ExecSubgraph(*attrs.subgraphs[0], ctx, subg_inputs, req, *subg_out_curr); + ExecSubgraph(state.subgraph, ctx, subg_inputs, req, *subg_out_curr); // We need to wait for the iteration to complete before executing // the next one or return from the loop. In this way, we can reuse // the memory in the subgraph. 
@@ -271,6 +281,14 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, return true; } +OpStatePtr CreateForeachState(const NodeAttrs& attrs, + Context ctx, + const std::vector& ishape, + const std::vector& itype) { + const ForeachParam& params = nnvm::get(attrs.parsed); + return OpStatePtr::Create(*attrs.subgraphs[0], params); +} + NNVM_REGISTER_OP(_foreach) .describe(R"code(foreach)code" ADD_FILELINE) .set_attr_parser(ParamParser) @@ -291,9 +309,10 @@ NNVM_REGISTER_OP(_foreach) [](const NodeAttrs& attrs) { return std::vector{0}; }) +.set_attr("FCreateOpState", CreateForeachState) .set_attr("FInferShape", ForeachShape) .set_attr("FInferType", ForeachType) -.set_attr("FComputeEx", ForeachComputeExCPU) +.set_attr("FStatefulComputeEx", ForeachComputeExCPU) .set_attr("key_var_num_args", "num_args") .add_argument("fn", "Symbol", "Input graph.") .add_argument("input", "NDArray-or-Symbol", "The input array where we iterate over.") From f33d0f4ae18df7074c9ff2d6df1d83ad1f6e86be Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 2 May 2018 22:08:23 +0000 Subject: [PATCH 030/135] Add backward. --- include/mxnet/ndarray.h | 4 + src/operator/nn/control_flow.cc | 222 ++++++++++++++++++++++--- tests/python/unittest/test_operator.py | 41 +++-- 3 files changed, 230 insertions(+), 37 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index d3f44404fd82..e88068537d0c 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -703,6 +703,10 @@ class NDArray { NDArray MKLDNNDataReshape(const TShape &shape) const; #endif + const nnvm::NodeEntry &GetAutogradEntry() const { + return entry_; + } + /*! * \brief Save list of ndarray into the Stream.x * \param fo The stream of output. diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 965dfa1b6560..e84b35267cb8 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -25,28 +25,12 @@ #include #include #include "../operator_common.h" +#include "../elemwise_op_common.h" #include "../../imperative/imperative_utils.h" namespace mxnet { namespace op { -static void ExecSubgraph(nnvm::Symbol &sym, const OpContext& ctx, - std::vector cinputs, - const std::vector& req, - std::vector coutputs) { - using namespace nnvm; - using namespace imperative; - - std::vector inputs(cinputs.size()); - std::vector outputs(coutputs.size()); - for (size_t i = 0; i < inputs.size(); i++) - inputs[i] = &cinputs[i]; - for (size_t i = 0; i < outputs.size(); i++) - outputs[i] = &coutputs[i]; - Imperative::CachedOp op(sym, std::vector >()); - op.Forward(nullptr, inputs, outputs); -} - struct ForeachParam : public dmlc::Parameter { int num_args; int dim; @@ -89,19 +73,115 @@ static std::vector ReorderInputs(const std::vector &in, const nnvm::Indexe struct ForeachState { Symbol subgraph; ForeachParam params; + // These are output arrays from all iterations. + // They also contain the Op state for each CachedOp. 
+ std::vector > all_outputs; + std::vector > all_inputs; + std::vector > all_gradients; + std::vector iter_ops; ForeachState(const Symbol &g, const ForeachParam ¶ms) { this->subgraph = g; this->params = params; } + + void Forward(std::vector cinputs, + const std::vector& req, + std::vector coutputs, bool is_recording); + void Backward(int iter_no, std::vector ograds, + const std::vector &req, + std::vector igrads); }; +void ForeachState::Forward(std::vector cinputs, + const std::vector& req, + std::vector coutputs, bool is_recording) { + using namespace nnvm; + using namespace imperative; + + bool orig_is_record; + if (is_recording) + orig_is_record = Imperative::Get()->set_is_recording(true); + else + orig_is_record = Imperative::Get()->is_recording(); + + std::vector inputs(cinputs.size()); + std::vector outputs(coutputs.size()); + for (size_t i = 0; i < inputs.size(); i++) + inputs[i] = &cinputs[i]; + for (size_t i = 0; i < outputs.size(); i++) + outputs[i] = &coutputs[i]; + + if (is_recording) { + all_inputs.push_back(cinputs); + std::vector gradients(cinputs.size()); + std::vector input_ptrs(cinputs.size()); + std::vector gradient_ptrs(cinputs.size()); + std::vector grad_reqs(cinputs.size()); + for (size_t i = 0; i < gradients.size(); i++) { + gradients[i] = NDArray(cinputs[i].shape(), cinputs[i].ctx(), + true, cinputs[i].dtype()); + input_ptrs[i] = &cinputs[i]; + gradient_ptrs[i] = &gradients[i]; + grad_reqs[i] = kWriteTo; + } + Imperative::Get()->MarkVariables(input_ptrs, grad_reqs, gradient_ptrs);; + } + + std::vector > kwargs; + kwargs.push_back(std::pair("inline_limit", "0")); + CachedOpPtr op = std::make_shared(subgraph, kwargs); + // TODO here we only changed the output arrays in the arguments. + // Will this be a problem? + op->Forward(nullptr, inputs, outputs); + + if (is_recording) { + // TODO does this have right inputs and outputs? + all_outputs.push_back(coutputs); + iter_ops.push_back(op); + } + + Imperative::Get()->set_is_recording(orig_is_record); +} + +void ForeachState::Backward(int iter_no, std::vector ograds, + const std::vector &req, + std::vector igrads) { + using namespace nnvm; + using namespace imperative; + + auto op = iter_ops[iter_no]; + std::vector inputs; + std::vector outputs; + inputs.reserve(op->num_backward_inputs()); + outputs.reserve(op->num_inputs()); + for (size_t i = 0; i < ograds.size(); i++) + inputs.push_back(&ograds[i]); +// for (size_t i = 0; i < all_inputs[iter_no].size(); i++) +// inputs.push_back(&all_inputs[iter_no][i]); +// for (size_t i = 0; i < all_outputs[iter_no].size(); i++) +// inputs.push_back(&all_outputs[iter_no][i]); + CHECK_EQ(inputs.size(), op->num_backward_inputs()); + for (size_t i = 0; i < igrads.size(); i++) + outputs.push_back(&igrads[i]); + CHECK_EQ(outputs.size(), op->num_inputs()); + + // TODO here we only changed the output arrays in the arguments. + // Will this be a problem? 
+ CHECK(!Imperative::AGInfo::IsNone(all_outputs[iter_no][0])); + const nnvm::NodeEntry &node_entry = all_outputs[iter_no][0].GetAutogradEntry(); + OpStatePtr state = Imperative::AGInfo::Get(node_entry.node).state; + op->Backward(false, state, inputs, req, outputs); +} + +static bool is_recording = true; + static void ForeachComputeExCPU(const OpStatePtr& state_ptr, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - ForeachState state = state_ptr.get_state(); + ForeachState &state = state_ptr.get_state(); const ForeachParam& params = state.params; CHECK_EQ(outputs.size(), (size_t) params.num_outputs); size_t len = inputs[0].shape()[0]; @@ -127,13 +207,13 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, if (len % 2 == 1) { for (size_t i = 1; i < subg_outputs1.size(); i++) { subg_outputs1[i] = outputs[i]; - subg_outputs2[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), false, + subg_outputs2[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), true, outputs[i].dtype()); } } else { // Otherwise, we'll use the second set of outputs. for (size_t i = 1; i < subg_outputs1.size(); i++) { - subg_outputs1[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), false, + subg_outputs1[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), true, outputs[i].dtype()); subg_outputs2[i] = outputs[i]; } @@ -143,9 +223,24 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, for (size_t i = 0; i < len; i++) { std::vector *subg_out_curr = subg_outputs[i % 2]; std::vector *subg_out_prev = subg_outputs[(i + 1) % 2]; + // TODO it might be possible that the data won't be written to the output + // array directly. (*subg_out_curr)[0] = outputs[0].At(i); + // When recording for backward computation, we should make sure + // that output arrays are actually different in each iteration. + if (is_recording && i < len - 1) { + for (size_t j = 1; j < subg_out_curr->size(); j++) + (*subg_out_curr)[j] = NDArray(outputs[j].shape(), outputs[j].ctx(), + true, outputs[j].dtype()); + } else if (is_recording && i == len - 1) { + // For the last iteration, we need to write data to the output array + // directly. + for (size_t j = 1; j < subg_out_curr->size(); j++) + (*subg_out_curr)[j] = outputs[j]; + } // Get a slice from the first input array. + // TODO how can we be sure that the first subgraph input is the data input? subg_inputs[0] = inputs[0].At(i); // For the rest of the iterations, the rest of the arguments are the outputs // from the previous iteration. @@ -156,7 +251,7 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, } } - ExecSubgraph(state.subgraph, ctx, subg_inputs, req, *subg_out_curr); + state.Forward(subg_inputs, req, *subg_out_curr, is_recording); // We need to wait for the iteration to complete before executing // the next one or return from the loop. In this way, we can reuse // the memory in the subgraph. @@ -165,6 +260,51 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, } } +static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + ForeachState &state = state_ptr.get_state(); + const ForeachParam& params = state.params; + CHECK_EQ(outputs.size(), (size_t) params.num_args - 1); + // The inputs contain out gradients, inputs and outputs. 
+ size_t len = inputs[0].shape()[0]; + size_t num_input_data = 1; + size_t num_output_data = 1; + + // In backward computation, we need to run iterations from backwards. + std::vector ograds(params.num_outputs); + std::vector igrads(params.num_args - 1); + for (size_t i = num_output_data; i < ograds.size(); i++) + ograds[i] = inputs[i]; + for (int iter_num = len - 1; iter_num >= 0; iter_num--) { + ograds[0] = inputs[0].At(iter_num); + igrads[0] = outputs[0].At(iter_num); + if (iter_num == 0) { + for (size_t i = num_input_data; i < igrads.size(); i++) + igrads[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), + true, outputs[i].dtype()); + } else { + for (size_t i = num_input_data; i < igrads.size(); i++) + igrads[i] = outputs[i]; + } + + // TODO is req correct here? + state.Backward(iter_num, ograds, req, igrads); + + // We need to wait for the iteration to complete before executing + // the next one or return from the loop. In this way, we can reuse + // the memory in the subgraph. + for (size_t i = 0; i < igrads.size(); i++) + igrads[i].WaitToRead(); + + size_t num_states = ograds.size() - num_output_data; + for (size_t i = 0; i < num_states; i++) + ograds[i + num_output_data] = igrads[i + num_input_data]; + } +} + static bool ForeachShape(const nnvm::NodeAttrs& attrs, std::vector *in_shape, std::vector *out_shape) { @@ -281,14 +421,30 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, return true; } -OpStatePtr CreateForeachState(const NodeAttrs& attrs, - Context ctx, - const std::vector& ishape, - const std::vector& itype) { +static bool BackwardForeachStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + // TODO I need to set storage type properly. + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, DispatchMode::kFComputeEx); +} + +static OpStatePtr CreateForeachState(const NodeAttrs& attrs, + Context ctx, + const std::vector& ishape, + const std::vector& itype) { const ForeachParam& params = nnvm::get(attrs.parsed); return OpStatePtr::Create(*attrs.subgraphs[0], params); } +void ForeachParamParser(nnvm::NodeAttrs* attrs) { + ParamParser(attrs); + // This is to indicate that the operator has a subgraph. 
+ attrs->subgraphs.resize(1); +} + NNVM_REGISTER_OP(_foreach) .describe(R"code(foreach)code" ADD_FILELINE) .set_attr_parser(ParamParser) @@ -309,6 +465,7 @@ NNVM_REGISTER_OP(_foreach) [](const NodeAttrs& attrs) { return std::vector{0}; }) +.set_attr("FGradient", ElemwiseGradUseInOut{"_backward_foreach"}) .set_attr("FCreateOpState", CreateForeachState) .set_attr("FInferShape", ForeachShape) .set_attr("FInferType", ForeachType) @@ -319,5 +476,20 @@ NNVM_REGISTER_OP(_foreach) .add_argument("states", "NDArray-or-Symbol[]", "The list of initial states.") .add_arguments(ForeachParam::__FIELDS__()); +NNVM_REGISTER_OP(_backward_foreach) +.set_num_inputs([](const NodeAttrs& attrs){ + const ForeachParam& params = nnvm::get(attrs.parsed); + return params.num_outputs * 2 + params.num_args - 1; + }) +.set_num_outputs([](const NodeAttrs& attrs){ + const ForeachParam& params = nnvm::get(attrs.parsed); + return params.num_args - 1; + }) +.set_attr("FInferStorageType", BackwardForeachStorageType) +.set_attr_parser(ForeachParamParser) +.set_attr("TIsLayerOpBackward", true) +.set_attr("TIsBackward", true) +.set_attr("FStatefulComputeEx", ForeachGradComputeExCPU); + } // namespace op } // namespace mxnet diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 0389744003f4..f7c5e9182ae7 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5952,18 +5952,35 @@ def step(in1, states): out = mx.sym.Group([out1, out[1][0]]) arr1 = mx.nd.random.uniform(shape=(5, 2)) arr2 = mx.nd.random.uniform(shape=(2)) - e = out.bind(ctx=mx.cpu(), args={'v3': arr1, 'v4': arr2}) - e.forward() - arr1 = arr1.asnumpy() - arr2 = arr2.asnumpy() - np_res = np.zeros_like(arr1) - for i in range(arr1.shape[0]): - if (i == 0): - np_res[i] = arr2 + arr1[i] - else: - np_res[i] = np_res[i - 1] + arr1[i] - np_res = np_res * 2 - assert_almost_equal(e.outputs[0].asnumpy(), np_res, rtol=0.001, atol=0.0001) + arr_grad1 = mx.nd.empty(arr1.shape) + arr_grad2 = mx.nd.empty(arr2.shape) + e = out.bind(ctx=mx.cpu(), args={'v3': arr1, 'v4': arr2}, + args_grad={'v3': arr_grad1, 'v4': arr_grad2}) + e.forward(is_train=True) + + out_grad = mx.nd.random.uniform(-10, 10, arr1.shape) + state_grad = mx.nd.random.uniform(-10, 10, arr2.shape) + # backward + e.backward([out_grad, state_grad]) + #e.backward() + + res = [] + arr1.attach_grad() + arr2.attach_grad() + with mx.autograd.record(): + for i in range(arr1.shape[0]): + if (i == 0): + tmp_res = mx.nd.expand_dims(arr2, 0) + mx.nd.expand_dims(arr1[i], 0) + else: + tmp_res = res[len(res) - 1] + mx.nd.expand_dims(arr1[i], 0) + res.append(tmp_res) + res1 = mx.nd.concat(*res, dim=0) + res2 = res1 * 2 + res = mx.nd.concat(res2, tmp_res, dim=0) + res.backward(mx.nd.concat(out_grad, mx.nd.expand_dims(state_grad, 0), dim=0)) + assert_almost_equal(e.outputs[0].asnumpy(), res2.asnumpy(), rtol=0.001, atol=0.0001) + assert_almost_equal(arr1.grad.asnumpy(), e.grad_arrays[0].asnumpy()) + assert_almost_equal(arr2.grad.asnumpy(), e.grad_arrays[1].asnumpy()) @with_seed() From d82dd30a45afe6ce83edfd85148711eeb94ef8c0 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 4 May 2018 16:43:36 +0000 Subject: [PATCH 031/135] Fix bugs. 
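This patch separates the three kinds of input gradients in the backward pass (sliced data, loop states, and read-only free variables) and accumulates the free-variable gradients with kAddTo after the last iteration, because every iteration contributes to them. Below is a minimal NumPy sketch of why those per-iteration gradients must be summed rather than overwritten; the step body (out = data + state + free), the shapes, and all variable names are illustrative stand-ins borrowed from the accumulation-sum test, not part of this patch:

    import numpy as np

    x = np.random.rand(3, 2)    # data iterated over axis 0
    s0 = np.random.rand(2)      # initial loop state
    v = np.random.rand(2)       # free (read-only) variable used in every step

    # forward: out[t] = x[t] + s[t-1] + v, and the new state is out[t]
    grad_s = np.ones_like(s0)   # seed: gradient w.r.t. the final state only
    grad_v = np.zeros_like(v)
    for t in reversed(range(x.shape[0])):
        # every iteration contributes to the free variable's gradient, so the
        # per-iteration result has to be accumulated (kAddTo), not overwritten.
        grad_v += grad_s
        # d out[t] / d s[t-1] is 1 for this step body, so grad_s is unchanged.
    print(grad_v)               # equals 3 * ones(2) after the 3 iterations above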
--- src/operator/nn/control_flow.cc | 37 ++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index e84b35267cb8..cb5aca4e9ac6 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -91,6 +91,12 @@ struct ForeachState { void Backward(int iter_no, std::vector ograds, const std::vector &req, std::vector igrads); + void Cleanup() { + all_outputs.clear(); + all_inputs.clear(); + all_gradients.clear(); + iter_ops.clear(); + } }; void ForeachState::Forward(std::vector cinputs, @@ -157,10 +163,19 @@ void ForeachState::Backward(int iter_no, std::vector ograds, outputs.reserve(op->num_inputs()); for (size_t i = 0; i < ograds.size(); i++) inputs.push_back(&ograds[i]); -// for (size_t i = 0; i < all_inputs[iter_no].size(); i++) -// inputs.push_back(&all_inputs[iter_no][i]); -// for (size_t i = 0; i < all_outputs[iter_no].size(); i++) -// inputs.push_back(&all_outputs[iter_no][i]); + + const std::vector &save_inputs = op->save_inputs(); + const std::vector &save_outputs = op->save_outputs(); + CHECK_EQ(save_inputs.size(), all_inputs[iter_no].size()); + CHECK_EQ(op->num_outputs(), all_outputs[iter_no].size()); + for (size_t i = 0; i < all_inputs[iter_no].size(); i++) { + if (save_inputs[i]) + inputs.push_back(&all_inputs[iter_no][i]); + } + for (size_t i = 0; i < all_outputs[iter_no].size(); i++) { + if (save_outputs[i]) + inputs.push_back(&all_outputs[iter_no][i]); + } CHECK_EQ(inputs.size(), op->num_backward_inputs()); for (size_t i = 0; i < igrads.size(); i++) outputs.push_back(&igrads[i]); @@ -281,7 +296,11 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, for (int iter_num = len - 1; iter_num >= 0; iter_num--) { ograds[0] = inputs[0].At(iter_num); igrads[0] = outputs[0].At(iter_num); - if (iter_num == 0) { + // There are three types of arrays in igrads. + // * data gradients. + // * loop variable gradients. + // * read-only variable gradients. + if (iter_num != 0) { for (size_t i = num_input_data; i < igrads.size(); i++) igrads[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), true, outputs[i].dtype()); @@ -300,9 +319,13 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, igrads[i].WaitToRead(); size_t num_states = ograds.size() - num_output_data; - for (size_t i = 0; i < num_states; i++) - ograds[i + num_output_data] = igrads[i + num_input_data]; + for (size_t i = 0; i < num_states; i++) { + size_t loc = params.in_state_locs[i]; + CHECK_LT(loc, igrads.size()); + ograds[i + num_output_data] = igrads[loc]; + } } + state.Cleanup(); } static bool ForeachShape(const nnvm::NodeAttrs& attrs, From 868c9f28575b0eb6cd30890d45502c9eb531fbf4 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 4 May 2018 16:44:28 +0000 Subject: [PATCH 032/135] enable backward test in lstm. 
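The test now binds both graphs with gradient buffers, runs forward with is_train=True, pushes the same random head gradients through the foreach-based LSTM and the explicitly unrolled LSTMCell, and compares their grad_arrays. A condensed sketch of that bind/forward/backward pattern follows, using a trivial stand-in symbol (the names x, x_arr and x_grad are placeholders, not the test's graph):

    import mxnet as mx

    x = mx.sym.var('x')
    out = x * 2                                    # stand-in for the unrolled graph
    x_arr = mx.nd.random.uniform(shape=(2, 4))
    x_grad = mx.nd.empty(x_arr.shape)
    e = out.bind(ctx=mx.cpu(), args={'x': x_arr}, args_grad={'x': x_grad})
    e.forward(is_train=True)
    out_grad = mx.nd.random.uniform(-10, 10, e.outputs[0].shape)
    e.backward([out_grad])
    # x_grad now holds 2 * out_grad; the real test compares e1.grad_arrays of
    # the foreach LSTM against e2.grad_arrays of the step-by-step LSTMCell unroll.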
--- tests/python/unittest/test_operator.py | 55 ++++++++++++++++++++------ 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index f7c5e9182ae7..a1df3e7fa54f 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5957,12 +5957,10 @@ def step(in1, states): e = out.bind(ctx=mx.cpu(), args={'v3': arr1, 'v4': arr2}, args_grad={'v3': arr_grad1, 'v4': arr_grad2}) e.forward(is_train=True) - + # backward out_grad = mx.nd.random.uniform(-10, 10, arr1.shape) state_grad = mx.nd.random.uniform(-10, 10, arr2.shape) - # backward e.backward([out_grad, state_grad]) - #e.backward() res = [] arr1.attach_grad() @@ -6021,13 +6019,37 @@ def sym_group(out): i2h_barr = mx.nd.random.uniform(shape=(16)) h2h_barr = mx.nd.random.uniform(shape=(16)) + data_arr_grad1 = mx.nd.empty(data_arr.shape) + h_arr_grad1 = mx.nd.empty(h_arr.shape) + c_arr_grad1 = mx.nd.empty(c_arr.shape) + i2h_warr_grad1 = mx.nd.empty(i2h_warr.shape) + h2h_warr_grad1 = mx.nd.empty(h2h_warr.shape) + i2h_barr_grad1 = mx.nd.empty(i2h_barr.shape) + h2h_barr_grad1 = mx.nd.empty(h2h_barr.shape) out = mx.sym.contrib.foreach(step, data, [init_h, init_c]) out = sym_group(out) - e = out.bind(ctx=mx.cpu(), args={'data': data_arr, 'h': h_arr, 'c': c_arr, - 'i2h_weight': i2h_warr, 'h2h_weight': h2h_warr, 'i2h_bias': i2h_barr, 'h2h_bias': h2h_barr}) - e.forward() - outputs1 = e.outputs - + e1 = out.bind(ctx=mx.cpu(), + args={'data': data_arr, 'h': h_arr, 'c': c_arr, + 'i2h_weight': i2h_warr, 'h2h_weight': h2h_warr, + 'i2h_bias': i2h_barr, 'h2h_bias': h2h_barr}, + args_grad={'data': data_arr_grad1, 'h': h_arr_grad1, 'c': c_arr_grad1, + 'i2h_weight': i2h_warr_grad1, 'h2h_weight': h2h_warr_grad1, + 'i2h_bias': i2h_barr_grad1, 'h2h_bias': h2h_barr_grad1}) + e1.forward(is_train=True) + outputs1 = e1.outputs + # backward + out_grads = [] + for arr in e1.outputs: + out_grads.append(mx.nd.random.uniform(-10, 10, arr.shape)) + e1.backward(out_grads) + + data_arr_grad2 = mx.nd.empty(data_arr.shape) + h_arr_grad2 = mx.nd.empty(h_arr.shape) + c_arr_grad2 = mx.nd.empty(c_arr.shape) + i2h_warr_grad2 = mx.nd.empty(i2h_warr.shape) + h2h_warr_grad2 = mx.nd.empty(h2h_warr.shape) + i2h_barr_grad2 = mx.nd.empty(i2h_barr.shape) + h2h_barr_grad2 = mx.nd.empty(h2h_barr.shape) lstm = mx.rnn.LSTMCell(4, prefix='mylstm_') h = init_h c = init_c @@ -6037,14 +6059,21 @@ def sym_group(out): unroll_outs.append(mx.sym.expand_dims(h, axis=0)) unroll_outs = mx.sym.concat(*unroll_outs, dim=0) out = mx.sym.Group([unroll_outs, h, c]) - e = out.bind(ctx=mx.cpu(), args={'data': data_arr, 'h': h_arr, 'c': c_arr, - 'mylstm_i2h_weight': i2h_warr, 'mylstm_h2h_weight': h2h_warr, - 'mylstm_i2h_bias': i2h_barr, 'mylstm_h2h_bias': h2h_barr}) - e.forward() - outputs2 = e.outputs + e2 = out.bind(ctx=mx.cpu(), + args={'data': data_arr, 'h': h_arr, 'c': c_arr, + 'mylstm_i2h_weight': i2h_warr, 'mylstm_h2h_weight': h2h_warr, + 'mylstm_i2h_bias': i2h_barr, 'mylstm_h2h_bias': h2h_barr}, + args_grad={'data': data_arr_grad2, 'h': h_arr_grad2, 'c': c_arr_grad2, + 'mylstm_i2h_weight': i2h_warr_grad2, 'mylstm_h2h_weight': h2h_warr_grad2, + 'mylstm_i2h_bias': i2h_barr_grad2, 'mylstm_h2h_bias': h2h_barr_grad2}) + e2.forward(is_train=True) + outputs2 = e2.outputs + e2.backward(out_grads) for i in range(len(outputs2)): assert_almost_equal(outputs1[i].asnumpy(), outputs2[i].asnumpy(), rtol=0.001, atol=0.0001) + for i in range(len(e1.grad_arrays)): + 
assert_almost_equal(e1.grad_arrays[i].asnumpy(), e2.grad_arrays[i].asnumpy()) @with_seed() From 84e1877ff1e2e1c9397870645fdec5396515283a Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 7 May 2018 17:28:46 +0000 Subject: [PATCH 033/135] Fix a bug in foreach backward for free variables. --- src/operator/nn/control_flow.cc | 45 ++++++++++++++++++-------- tests/python/unittest/test_operator.py | 25 ++++++++++---- 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index cb5aca4e9ac6..4a31af53a290 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -284,39 +284,58 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, const ForeachParam& params = state.params; CHECK_EQ(outputs.size(), (size_t) params.num_args - 1); // The inputs contain out gradients, inputs and outputs. - size_t len = inputs[0].shape()[0]; + int len = inputs[0].shape()[0]; size_t num_input_data = 1; size_t num_output_data = 1; // In backward computation, we need to run iterations from backwards. std::vector ograds(params.num_outputs); - std::vector igrads(params.num_args - 1); + std::vector igrads(outputs.size()); for (size_t i = num_output_data; i < ograds.size(); i++) ograds[i] = inputs[i]; + std::vector iter_req(req.size()); + for (auto r : req) + CHECK_NE(r, kWriteInplace); for (int iter_num = len - 1; iter_num >= 0; iter_num--) { + // TODO data isn't always the first one. ograds[0] = inputs[0].At(iter_num); igrads[0] = outputs[0].At(iter_num); - // There are three types of arrays in igrads. - // * data gradients. - // * loop variable gradients. - // * read-only variable gradients. - if (iter_num != 0) { - for (size_t i = num_input_data; i < igrads.size(); i++) + iter_req[0] = req[0]; + for (size_t i = num_input_data; i < igrads.size(); i++) { + // There are three types of arrays in igrads. + // * data gradients. + // * loop variable gradients. + // * read-only variable gradients. + // For state gradients, we need to allocate new NDArrays + // because intermediate state gradients won't be returned to the users. + bool in_state = std::find(params.in_state_locs.begin(), + params.in_state_locs.end(), + i) != params.in_state_locs.end(); + if (iter_num != 0 && in_state) { igrads[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), true, outputs[i].dtype()); - } else { - for (size_t i = num_input_data; i < igrads.size(); i++) + } else { igrads[i] = outputs[i]; + } + if (in_state) + // For the first iteration, we need to use the request provided by + // the user to write state gradients to the outputs. + iter_req[i] = iter_num != 0 ? kWriteTo : req[i]; + else + // For all read-only variable gradients, we need to use the request + // provided by the user in the last iteration and later on add gradients + // to the output arrays. + iter_req[i] = iter_num == len - 1 ? req[i]: kAddTo; } - // TODO is req correct here? - state.Backward(iter_num, ograds, req, igrads); + state.Backward(iter_num, ograds, iter_req, igrads); // We need to wait for the iteration to complete before executing // the next one or return from the loop. In this way, we can reuse // the memory in the subgraph. 
- for (size_t i = 0; i < igrads.size(); i++) + for (size_t i = 0; i < igrads.size(); i++) { igrads[i].WaitToRead(); + } size_t num_states = ograds.size() - num_output_data; for (size_t i = 0; i < num_states; i++) { diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index a1df3e7fa54f..1ebe8341b16e 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5941,21 +5941,24 @@ def test_float16_min_max(): def test_foreach(): v3 = mx.sym.var("v3") v4 = mx.sym.var("v4") + v5 = mx.sym.var("v5") # This tests foreach with accumulation sum. def step(in1, states): - out = in1 + states[0] + out = in1 + states[0] + v5 return (out, [out]) out = mx.sym.contrib.foreach(step, v3, [v4]) out1 = out[0] * 2 out = mx.sym.Group([out1, out[1][0]]) - arr1 = mx.nd.random.uniform(shape=(5, 2)) + arr1 = mx.nd.random.uniform(shape=(2, 2)) arr2 = mx.nd.random.uniform(shape=(2)) + arr3 = mx.nd.random.uniform(shape=(2)) arr_grad1 = mx.nd.empty(arr1.shape) arr_grad2 = mx.nd.empty(arr2.shape) - e = out.bind(ctx=mx.cpu(), args={'v3': arr1, 'v4': arr2}, - args_grad={'v3': arr_grad1, 'v4': arr_grad2}) + arr_grad3 = mx.nd.empty(arr3.shape) + e = out.bind(ctx=mx.cpu(), args={'v3': arr1, 'v4': arr2, 'v5': arr3}, + args_grad={'v3': arr_grad1, 'v4': arr_grad2, 'v5': arr_grad3}) e.forward(is_train=True) # backward out_grad = mx.nd.random.uniform(-10, 10, arr1.shape) @@ -5965,12 +5968,13 @@ def step(in1, states): res = [] arr1.attach_grad() arr2.attach_grad() + arr3.attach_grad() with mx.autograd.record(): for i in range(arr1.shape[0]): if (i == 0): - tmp_res = mx.nd.expand_dims(arr2, 0) + mx.nd.expand_dims(arr1[i], 0) + tmp_res = mx.nd.expand_dims(arr2, 0) + mx.nd.expand_dims(arr1[i], 0) + mx.nd.expand_dims(arr3, 0) else: - tmp_res = res[len(res) - 1] + mx.nd.expand_dims(arr1[i], 0) + tmp_res = res[len(res) - 1] + mx.nd.expand_dims(arr1[i], 0) + mx.nd.expand_dims(arr3, 0) res.append(tmp_res) res1 = mx.nd.concat(*res, dim=0) res2 = res1 * 2 @@ -5979,6 +5983,7 @@ def step(in1, states): assert_almost_equal(e.outputs[0].asnumpy(), res2.asnumpy(), rtol=0.001, atol=0.0001) assert_almost_equal(arr1.grad.asnumpy(), e.grad_arrays[0].asnumpy()) assert_almost_equal(arr2.grad.asnumpy(), e.grad_arrays[1].asnumpy()) + assert_almost_equal(arr3.grad.asnumpy(), e.grad_arrays[2].asnumpy()) @with_seed() @@ -6011,7 +6016,7 @@ def sym_group(out): ret.extend(out[1]) return mx.sym.Group(ret) - data_arr = mx.nd.random.uniform(shape=(5, 2, 4)) + data_arr = mx.nd.random.uniform(shape=(2, 2, 4)) h_arr = mx.nd.random.uniform(shape=(2, 4)) c_arr = mx.nd.random.uniform(shape=(2, 4)) i2h_warr = mx.nd.random.uniform(shape=(16, 4)) @@ -6076,6 +6081,12 @@ def sym_group(out): assert_almost_equal(e1.grad_arrays[i].asnumpy(), e2.grad_arrays[i].asnumpy()) +# TODO Test cases: +# in an iteration, data is stored in any location. +# # iterations: odd or even. +# multiple inputs and multiple outputs. + + @with_seed() def test_squeeze_op(): def check_squeeze_op(shape, axis=None): From 00b8b1c90df403575669707640b33a5827460442 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 9 May 2018 00:31:33 +0000 Subject: [PATCH 034/135] change for the new CachedOp. 
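The CachedOp constructor used here also takes the ordered argument names of the subgraph plus a parameter map (empty for foreach, since every array is passed in as an input), so ForeachState keeps an nnvm::Graph around just to read those names off the indexed graph's input nodes. A small sketch of the corresponding ordering on the Python side, assuming an illustrative step body (data, state and free are made-up names):

    import mxnet as mx

    data = mx.sym.var('data')     # the sliced element for one iteration
    state = mx.sym.var('state')   # a loop state
    free = mx.sym.var('free')     # a read-only variable captured by the body
    step_out = data * 2 + state + free
    # The subgraph's inputs in graph order; the C++ side reads a matching
    # ordering off the indexed graph's input nodes to build the cached op's
    # argument names.
    print(step_out.list_inputs())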
--- src/operator/nn/control_flow.cc | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 4a31af53a290..8d5ede3391b9 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -71,7 +71,8 @@ static std::vector ReorderInputs(const std::vector &in, const nnvm::Indexe } struct ForeachState { - Symbol subgraph; + Symbol subgraph_sym; + nnvm::Graph subgraph; ForeachParam params; // These are output arrays from all iterations. // They also contain the Op state for each CachedOp. @@ -81,7 +82,8 @@ struct ForeachState { std::vector iter_ops; ForeachState(const Symbol &g, const ForeachParam ¶ms) { - this->subgraph = g; + this->subgraph_sym = g; + this->subgraph.outputs = g.outputs; this->params = params; } @@ -136,7 +138,15 @@ void ForeachState::Forward(std::vector cinputs, std::vector > kwargs; kwargs.push_back(std::pair("inline_limit", "0")); - CachedOpPtr op = std::make_shared(subgraph, kwargs); + // Get input names. + const auto& idx = subgraph.indexed_graph(); + std::vector arg_names(idx.input_nodes().size()); + for (size_t i = 0; i < idx.input_nodes().size(); ++i) + arg_names[i] = idx[idx.input_nodes()[i]].source->attrs.name; + // We don't have parameters for the cached op. + std::unordered_map > params; + CachedOpPtr op = std::make_shared(subgraph_sym, kwargs, + arg_names, params); // TODO here we only changed the output arrays in the arguments. // Will this be a problem? op->Forward(nullptr, inputs, outputs); From f2e324e9f1f3923b60bd14d482ac3ec36909e690 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 9 May 2018 03:33:08 +0000 Subject: [PATCH 035/135] Detect the backward computation. --- include/mxnet/op_attr_types.h | 4 +++- src/executor/graph_executor.cc | 9 ++++++--- src/executor/graph_executor.h | 2 ++ src/imperative/imperative_utils.h | 12 ++++++++---- src/ndarray/ndarray.cc | 3 ++- src/operator/nn/control_flow.cc | 20 +++++++++++--------- 6 files changed, 32 insertions(+), 18 deletions(-) diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index f4694efad297..b78cbf518feb 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -64,8 +64,10 @@ enum OpReqType { * \sa Resource */ struct OpContext { + /*! \brief whether there is a backward phase to compute gradients. */ + bool need_grad; /*! \brief whether it is training phase */ - int is_train; + bool is_train; /*! \brief RunContext related resources */ RunContext run_ctx; /*! 
\brief the callback when operation completes, used by asynchronize ops */ diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 399581697b1d..4478e0bb44b2 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -39,6 +39,7 @@ namespace exec { GraphExecutor::GraphExecutor() { log_verbose_ = dmlc::GetEnv("MXNET_EXEC_VERBOSE_LOGGING", false); + need_grad_ = false; } GraphExecutor::~GraphExecutor() { @@ -257,11 +258,11 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol, nnvm::Graph g; g.outputs = symbol.outputs; - bool need_grad = false; + need_grad_ = false; for (OpReqType req : grad_req_types) { - if (req != kNullOp) need_grad = true; + if (req != kNullOp) need_grad_ = true; } - if (!need_grad) return g; + if (!need_grad_) return g; for (size_t i = 0; i < g.outputs.size(); ++i) { NodeEntry ngrad{nnvm::Node::Create(), 0, 0}; head_grad_entry_.emplace_back(AttrHint(ngrad, g.outputs[i])); @@ -1603,6 +1604,7 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { const auto& inode = idx[nid]; if (inode.source->is_variable()) continue; opnode.exec->op_ctx.is_train = is_train; + opnode.exec->op_ctx.need_grad = need_grad_; } // Push Ops @@ -1621,6 +1623,7 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { OpNode& opnode = op_nodes_[nid]; if (op_nodes_[nid].skip_exec_node) continue; opnode.exec->op_ctx.is_train = is_train; + opnode.exec->op_ctx.need_grad = need_grad_; if (opnode.exec->exec_type() == ExecType::kCrossDeviceCopy) { CHECK_EQ(inode.inputs.size(), 1U); CHECK_EQ(opnode.exec->in_array.size(), 1U); diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h index 24f98894912b..bfc415b4526a 100644 --- a/src/executor/graph_executor.h +++ b/src/executor/graph_executor.h @@ -213,6 +213,8 @@ class GraphExecutor : public Executor { // perform bulking and segmentation on a training graph void BulkTrainingOpSegs(size_t total_num_nodes); + // indicate whether there is a backward graph for gradients. 
+ bool need_grad_; // internal graph nnvm::Graph graph_; // operator node diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index faff5f173fe1..691f0822b082 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -393,7 +393,8 @@ inline void PushFCompute(const FCompute& fn, &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst, &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx); // setup context - OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested}; + bool need_grad = Imperative::Get()->is_recording(); + OpContext opctx{need_grad, is_train, rctx, engine::CallbackOnComplete(), requested}; bool is_gpu = ctx.dev_mask() == gpu::kDevMask; // pre-fcompute fallback, cast to default storage type CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx, is_gpu); @@ -424,7 +425,8 @@ inline void PushFComputeEx(const FComputeEx& fn, std::vector inputs, outputs; DerefInputOutput(p_inputs, p_outputs, &inputs, &outputs); const auto& run = [=](RunContext rctx) { - OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested}; + bool need_grad = Imperative::Get()->is_recording(); + OpContext opctx{need_grad, is_train, rctx, engine::CallbackOnComplete(), requested}; #if MXNET_USE_MKLDNN == 1 InvalidateOutputs(outputs, req); #endif @@ -470,7 +472,8 @@ inline void PushOperator(const OpStatePtr& state, if (fcompute_ex != nullptr && dispatch_mode == DispatchMode::kFComputeEx) { const auto& run = [=](RunContext rctx, engine::CallbackOnComplete on_complete) { - OpContext opctx{is_train, rctx, on_complete, requested}; + bool need_grad = Imperative::Get()->is_recording(); + OpContext opctx{need_grad, is_train, rctx, on_complete, requested}; #if MXNET_USE_MKLDNN == 1 InvalidateOutputs(outputs, req); #endif @@ -497,7 +500,8 @@ inline void PushOperator(const OpStatePtr& state, << "for stateful operator " << op->name; const auto& run = [=](RunContext rctx, engine::CallbackOnComplete on_complete) { - OpContext opctx{is_train, rctx, on_complete, requested}; + bool need_grad = Imperative::Get()->is_recording(); + OpContext opctx{need_grad, is_train, rctx, on_complete, requested}; std::vector input_blobs, output_blobs; // pre-fcompute and post-fcompute storage fallback src NDArrays and dst NDArrays diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index e90fb6319d77..0b2beed3391b 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -1109,7 +1109,8 @@ void CopyFromToImpl(const NDArray& from, const NDArray& to, const Context to_ctx = to.ctx(); bool is_train = Imperative::Get()->is_training(); - OpContext opctx{is_train, + OpContext opctx{Imperative::Get()->is_recording(), + is_train, rctx, engine::CallbackOnComplete(), requested}; diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 8d5ede3391b9..0a3d2e1a78b4 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -70,10 +70,7 @@ static std::vector ReorderInputs(const std::vector &in, const nnvm::Indexe return ret; } -struct ForeachState { - Symbol subgraph_sym; - nnvm::Graph subgraph; - ForeachParam params; +class ForeachState { // These are output arrays from all iterations. // They also contain the Op state for each CachedOp. 
std::vector > all_outputs; @@ -81,6 +78,11 @@ struct ForeachState { std::vector > all_gradients; std::vector iter_ops; + public: + Symbol subgraph_sym; + nnvm::Graph subgraph; + ForeachParam params; + ForeachState(const Symbol &g, const ForeachParam ¶ms) { this->subgraph_sym = g; this->subgraph.outputs = g.outputs; @@ -166,6 +168,8 @@ void ForeachState::Backward(int iter_no, std::vector ograds, using namespace nnvm; using namespace imperative; + CHECK_GT(iter_ops.size(), iter_no) + << "We didn't record the computation for iteration " << iter_no; auto op = iter_ops[iter_no]; std::vector inputs; std::vector outputs; @@ -199,8 +203,6 @@ void ForeachState::Backward(int iter_no, std::vector ograds, op->Backward(false, state, inputs, req, outputs); } -static bool is_recording = true; - static void ForeachComputeExCPU(const OpStatePtr& state_ptr, const OpContext& ctx, const std::vector& inputs, @@ -253,11 +255,11 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, (*subg_out_curr)[0] = outputs[0].At(i); // When recording for backward computation, we should make sure // that output arrays are actually different in each iteration. - if (is_recording && i < len - 1) { + if (ctx.need_grad && i < len - 1) { for (size_t j = 1; j < subg_out_curr->size(); j++) (*subg_out_curr)[j] = NDArray(outputs[j].shape(), outputs[j].ctx(), true, outputs[j].dtype()); - } else if (is_recording && i == len - 1) { + } else if (ctx.need_grad && i == len - 1) { // For the last iteration, we need to write data to the output array // directly. for (size_t j = 1; j < subg_out_curr->size(); j++) @@ -276,7 +278,7 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, } } - state.Forward(subg_inputs, req, *subg_out_curr, is_recording); + state.Forward(subg_inputs, req, *subg_out_curr, ctx.need_grad); // We need to wait for the iteration to complete before executing // the next one or return from the loop. In this way, we can reuse // the memory in the subgraph. From e1322d1321521b5fec185941957b11e4d891e0cc Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 9 May 2018 23:26:34 +0000 Subject: [PATCH 036/135] Fix bugs in foreach. --- python/mxnet/symbol/contrib.py | 32 ++++--- src/operator/nn/control_flow.cc | 152 ++++++++++++++++++++------------ 2 files changed, 116 insertions(+), 68 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 856591ca0c09..1d82cdc92763 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -30,7 +30,7 @@ from . import symbol from ..base import _LIB, c_str, c_array, check_call -from ..base import SymbolHandle, NDArrayHandle +from ..base import SymbolHandle, NDArrayHandle, _as_list from ..attribute import AttrScope __all__ = ["rand_zipfian"] @@ -115,11 +115,14 @@ def foreach(func, input, init_states, back_prop=False, name="foreach"): assert isinstance(init_states, list), "init_states should be a list" states = [] with AttrScope(subgraph_name=name): - in_ele = symbol.var("in") + if isinstance(input, list): + in_eles = [symbol.var(sym.name) for sym in input] + else: + in_eles = symbol.var(input.name) for s in init_states: states.append(symbol.var(s.name)) - sym_out = func(in_ele, states) + sym_out = func(in_eles, states) # The function should return a tuple. The first element goes to # the output of the function. The second element is a list. 
assert isinstance(sym_out, tuple), "func should return a tuple (out, states)" @@ -133,6 +136,7 @@ def foreach(func, input, init_states, back_prop=False, name="foreach"): flat_out = sym_out[0] else: flat_out = [sym_out[0]] + num_out_data = len(flat_out) for s in sym_out[1]: # There is a problem if the outputs are the same as the inputs # or the first output. @@ -153,26 +157,32 @@ def foreach(func, input, init_states, back_prop=False, name="foreach"): input_syms = {sym.name:sym for sym in input_syms} gin_names = input_syms.keys() # This array contains the symbols for the inputs of foreach. + # They are ordered according to the inputs of the subgraph. ordered_ins = [] states_map = {sym.name:sym for sym in init_states} state_names = states_map.keys() - in_state_locs = [-1] * len(init_states) + data_syms = _as_list(input) + data_map = {sym.name:sym for sym in data_syms} + data_names = data_map.keys() + in_state_locs = [] + in_data_locs = [] for in_name in g.list_inputs(): assert in_name in gin_names, "The input variable %s can't be found in graph inputs: %s" \ % (in_name, str(gin_names)) if (in_name in state_names): ordered_ins.append(states_map[in_name]) - elif (in_name != "in"): + in_state_locs.append(len(ordered_ins) - 1) + elif (in_name in data_names): + ordered_ins.append(data_map[in_name]) + in_data_locs.append(len(ordered_ins) - 1) + else: ordered_ins.append(input_syms[in_name]) - for i in range(len(init_states)): - if (init_states[i].name == in_name): - in_state_locs[i] = len(ordered_ins) - 1 + num_inputs - num_outputs = len(flat_out) num_states = len(state_names) - ret = symbol._internal._foreach(g, input, *ordered_ins, num_outputs=num_outputs, - in_state_locs=in_state_locs) + ret = symbol._internal._foreach(g, *ordered_ins, num_outputs=num_outputs, + num_out_data=num_out_data, in_state_locs=in_state_locs, + in_data_locs=in_data_locs) if (num_outputs - num_states > 1): outs = [] for i in range(num_outputs - num_states): diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 0a3d2e1a78b4..6fe3675dfa5d 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -35,7 +35,9 @@ struct ForeachParam : public dmlc::Parameter { int num_args; int dim; int num_outputs; + int num_out_data; nnvm::Tuple in_state_locs; + nnvm::Tuple in_data_locs; DMLC_DECLARE_PARAMETER(ForeachParam) { DMLC_DECLARE_FIELD(num_args).set_lower_bound(1) .describe("Number of inputs."); @@ -43,8 +45,12 @@ struct ForeachParam : public dmlc::Parameter { .describe("the dimension of the input array to iterate."); DMLC_DECLARE_FIELD(num_outputs) .describe("The number of outputs of the subgraph."); + DMLC_DECLARE_FIELD(num_out_data) + .describe("The number of output data of the subgraph."); DMLC_DECLARE_FIELD(in_state_locs) .describe("The locations of loop states among the inputs."); + DMLC_DECLARE_FIELD(in_data_locs) + .describe("The locations of input data among the inputs."); } }; // struct ForeachParam @@ -210,16 +216,17 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, const std::vector& outputs) { ForeachState &state = state_ptr.get_state(); const ForeachParam& params = state.params; + size_t iter_dim = 0; CHECK_EQ(outputs.size(), (size_t) params.num_outputs); - size_t len = inputs[0].shape()[0]; - CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); - - // Initialize the inputs for the subgraph. - std::vector subg_inputs(inputs.size()); - for (size_t i = 1; i < inputs.size(); i++) { - // These are the initial states. 
- subg_inputs[i] = inputs[i]; + CHECK_GT(params.in_data_locs.ndim(), 0); + size_t loc0 = params.in_data_locs[0]; + size_t len = inputs[loc0].shape()[iter_dim]; + for (size_t i = 1; i < params.in_data_locs.ndim(); i++) { + size_t loc = params.in_data_locs[i]; + CHECK_EQ(inputs[loc].shape()[iter_dim], len); } + for (size_t i = 0; i < (size_t) params.num_out_data; i++) + CHECK_EQ(len, outputs[i].shape()[iter_dim]); // Initialize the outputs of the subgraph is a little trickier. // The states from the previous iteration are used as the inputs of the next @@ -246,35 +253,49 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, } } + // Initialize the inputs for the subgraph. + // In each iteration, we need to update the subgraph inputs for input data + // and the loop states. This initialization helps to get the read-only + // arrays in the loop. + std::vector subg_inputs(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + // These are the initial states. + subg_inputs[i] = inputs[i]; + } + // Here we iterate over the first dimension of the first input array. for (size_t i = 0; i < len; i++) { + // Initialize outputs for the subgraph. std::vector *subg_out_curr = subg_outputs[i % 2]; std::vector *subg_out_prev = subg_outputs[(i + 1) % 2]; - // TODO it might be possible that the data won't be written to the output - // array directly. - (*subg_out_curr)[0] = outputs[0].At(i); + for (int j = 0; j < params.num_out_data; j++) + (*subg_out_curr)[j] = outputs[j].At(i); // When recording for backward computation, we should make sure // that output arrays are actually different in each iteration. if (ctx.need_grad && i < len - 1) { - for (size_t j = 1; j < subg_out_curr->size(); j++) + for (size_t j = params.num_out_data; j < subg_out_curr->size(); j++) (*subg_out_curr)[j] = NDArray(outputs[j].shape(), outputs[j].ctx(), true, outputs[j].dtype()); } else if (ctx.need_grad && i == len - 1) { // For the last iteration, we need to write data to the output array // directly. - for (size_t j = 1; j < subg_out_curr->size(); j++) + for (size_t j = params.num_out_data; j < subg_out_curr->size(); j++) (*subg_out_curr)[j] = outputs[j]; } - // Get a slice from the first input array. - // TODO how can we be sure that the first subgraph input is the data input? - subg_inputs[0] = inputs[0].At(i); + // Initialize inputs for the subgraph. + // Get a slice from the input data arrays. + for (size_t j = 0; j < params.in_data_locs.ndim(); j++) { + size_t loc = params.in_data_locs[j]; + subg_inputs[loc] = inputs[loc].At(i); + } // For the rest of the iterations, the rest of the arguments are the outputs // from the previous iteration. if (i > 0) { - for (size_t j = 1; j < subg_out_prev->size(); j++) { - CHECK_LT(params.in_state_locs[j - 1], subg_inputs.size()); - subg_inputs[params.in_state_locs[j - 1]] = (*subg_out_prev)[j]; + for (size_t j = params.num_out_data; j < subg_out_prev->size(); j++) { + size_t idx = j - params.num_out_data; + CHECK_LT(params.in_state_locs[idx], subg_inputs.size()); + subg_inputs[params.in_state_locs[idx]] = (*subg_out_prev)[j]; } } @@ -282,8 +303,9 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, // We need to wait for the iteration to complete before executing // the next one or return from the loop. In this way, we can reuse // the memory in the subgraph. 
- for (size_t j = 0; j < subg_out_curr->size(); j++) + for (size_t j = 0; j < subg_out_curr->size(); j++) { (*subg_out_curr)[j].WaitToRead(); + } } } @@ -295,10 +317,15 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, ForeachState &state = state_ptr.get_state(); const ForeachParam& params = state.params; CHECK_EQ(outputs.size(), (size_t) params.num_args - 1); + CHECK_GT(params.in_data_locs.ndim(), 0); + size_t iter_dim = 0; + std::unordered_set in_data_locs(params.in_data_locs.begin(), + params.in_data_locs.end()); + std::unordered_set in_state_locs(params.in_state_locs.begin(), + params.in_state_locs.end()); // The inputs contain out gradients, inputs and outputs. - int len = inputs[0].shape()[0]; - size_t num_input_data = 1; - size_t num_output_data = 1; + size_t len = inputs[0].shape()[iter_dim]; + size_t num_output_data = params.num_out_data; // In backward computation, we need to run iterations from backwards. std::vector ograds(params.num_outputs); @@ -309,21 +336,26 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, for (auto r : req) CHECK_NE(r, kWriteInplace); for (int iter_num = len - 1; iter_num >= 0; iter_num--) { - // TODO data isn't always the first one. - ograds[0] = inputs[0].At(iter_num); - igrads[0] = outputs[0].At(iter_num); - iter_req[0] = req[0]; - for (size_t i = num_input_data; i < igrads.size(); i++) { - // There are three types of arrays in igrads. - // * data gradients. - // * loop variable gradients. - // * read-only variable gradients. - // For state gradients, we need to allocate new NDArrays - // because intermediate state gradients won't be returned to the users. - bool in_state = std::find(params.in_state_locs.begin(), - params.in_state_locs.end(), - i) != params.in_state_locs.end(); + for (int i = 0; i < params.num_out_data; i++) + ograds[i] = inputs[i].At(iter_num); + + // There are three types of arrays in igrads. + // * data gradients. + // * loop variable gradients. + // * read-only variable gradients. + // These are the input data gradients. + for (size_t i = 0; i < igrads.size(); i++) { + // data gradients. + if (in_data_locs.count(i)) { + igrads[i] = outputs[i].At(iter_num); + iter_req[i] = req[i]; + continue; + } + + bool in_state = in_state_locs.count(i); if (iter_num != 0 && in_state) { + // For state gradients, we need to allocate new NDArrays + // because intermediate state gradients won't be returned to the users. igrads[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), true, outputs[i].dtype()); } else { @@ -366,7 +398,13 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_shape->size(), (size_t) params.num_outputs); nnvm::ShapeVector shape_inputs = *in_shape; // foreach iterates over the first input NDArray over the first dimension. - shape_inputs[0] = TShape(in_shape->at(0).begin() + 1, in_shape->at(0).end()); + size_t loc0 = params.in_data_locs[0]; + size_t len = in_shape->at(loc0)[0]; + for (size_t i = 0; i < params.in_data_locs.ndim(); i++) { + size_t loc = params.in_data_locs[i]; + CHECK_EQ(len, in_shape->at(loc)[0]); + shape_inputs[loc] = TShape(in_shape->at(loc).begin() + 1, in_shape->at(loc).end()); + } CHECK_EQ(attrs.subgraphs.size(), 1U); auto g = std::make_shared(); g->outputs = attrs.subgraphs[0]->outputs; @@ -381,24 +419,26 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, // We need to copy the inferred input shapes back. 
const auto &input_nids = idx.input_nodes(); CHECK_EQ(input_nids.size(), in_shape->size()); - size_t num_input_arrays = 1; - for (size_t i = num_input_arrays; i < in_shape->size(); i++) { + for (size_t i = 0; i < in_shape->size(); i++) { auto eid = idx.entry_id(input_nids[i], 0); - (*in_shape)[i] = shapes[eid]; + // If the input shape is none, we should update them. + if ((*in_shape)[i].ndim() == 0 || (*in_shape)[i].Size() == 0) + (*in_shape)[i] = shapes[eid]; } - // For the first shape. - uint32_t eid = idx.entry_id(g->outputs[0]); - const auto& g_out_shape = shapes[eid]; - const auto &in0 = (*in_shape)[0]; - auto &out0 = (*out_shape)[0]; - CHECK_EQ(g_out_shape.ndim() + 1, in0.ndim()); - out0 = in0; - for (size_t i = 1; i < out0.ndim(); i++) - out0[i] = g_out_shape[i - 1]; + // For the shape of output data. + for (int i = 0; i < params.num_out_data; i++) { + uint32_t eid = idx.entry_id(g->outputs[i]); + const auto& g_out_shape = shapes[eid]; + auto &out = (*out_shape)[i]; + out = TShape(g_out_shape.ndim() + 1); + out[0] = len; + for (size_t i = 1; i < out.ndim(); i++) + out[i] = g_out_shape[i - 1]; + } // For the remaining shapes. - for (size_t i = 1; i < g->outputs.size(); i++) { + for (size_t i = params.num_out_data; i < g->outputs.size(); i++) { uint32_t eid = idx.entry_id(g->outputs[i]); (*out_shape)[i] = shapes[eid]; } @@ -419,14 +459,13 @@ static bool ForeachType(const nnvm::NodeAttrs& attrs, CHECK_EQ(idx.outputs().size(), out_type->size()); imperative::CheckAndInferType(g.get(), std::move(dtype_inputs), true); - size_t num_input_arrays = 1; const auto &dtypes = g->GetAttr("dtype"); // Inferring the data type in the subgraph may infer the data type of the inputs. // We need to copy the inferred input data types back. const auto &input_nids = idx.input_nodes(); CHECK_EQ(input_nids.size(), in_type->size()); - for (size_t i = num_input_arrays; i < in_type->size(); i++) { + for (size_t i = 0; i < in_type->size(); i++) { auto eid = idx.entry_id(input_nids[i], 0); (*in_type)[i] = dtypes[eid]; } @@ -455,14 +494,13 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, imperative::CheckAndInferStorageType(g.get(), std::move(dev_masks), std::move(storage_type_inputs), true); - size_t num_input_arrays = 1; const auto& stypes = g->GetAttr("storage_type"); // Inferring the storage in the subgraph may infer the storage of the inputs. // We need to copy the inferred input storage back. const auto &input_nids = idx.input_nodes(); CHECK_EQ(input_nids.size(), in_attrs->size()); - for (size_t i = num_input_arrays; i < in_attrs->size(); i++) { + for (size_t i = 0; i < in_attrs->size(); i++) { auto eid = idx.entry_id(input_nids[i], 0); (*in_attrs)[i] = stypes[eid]; } @@ -526,8 +564,8 @@ NNVM_REGISTER_OP(_foreach) .set_attr("FStatefulComputeEx", ForeachComputeExCPU) .set_attr("key_var_num_args", "num_args") .add_argument("fn", "Symbol", "Input graph.") -.add_argument("input", "NDArray-or-Symbol", "The input array where we iterate over.") -.add_argument("states", "NDArray-or-Symbol[]", "The list of initial states.") +.add_argument("inputs", "NDArray-or-Symbol[]", + "The input arrays that include data arrays and states.") .add_arguments(ForeachParam::__FIELDS__()); NNVM_REGISTER_OP(_backward_foreach) From 98955a4e911e4b50a41728dd0ca3cac3ad67ee17 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 10 May 2018 23:21:32 +0000 Subject: [PATCH 037/135] fix tests. 
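The rewritten test wraps the checks in a verify_foreach helper that binds the graph, runs forward and backward, and checks the result against a reference computation, exercising step bodies with different argument orders and a free variable. A NumPy sketch of the reference computation for the step out = data * 2 + state + free used by step1; the shapes below are illustrative:

    import numpy as np

    data = np.random.rand(3, 2)   # iterated over axis 0
    state = np.random.rand(2)
    free = np.random.rand(2)

    outs = []
    for t in range(data.shape[0]):
        state = data[t] * 2 + state + free   # step1: the output is also the new state
        outs.append(state)
    stacked = np.stack(outs)      # first foreach output: per-iteration outputs stacked
    final_state = state           # second foreach output: the final loop state
    print(stacked.shape, final_state.shape)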
--- tests/python/unittest/test_operator.py | 140 ++++++++++++++++--------- 1 file changed, 93 insertions(+), 47 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 1ebe8341b16e..10356c648f80 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5937,53 +5937,105 @@ def test_float16_min_max(): assert np.finfo('float16').max == mx.nd.max(a).asscalar() +# TODO Test cases: +# in an iteration, data is stored in any location. +# # iterations: odd or even. +# multiple inputs and multiple outputs. +# test nested loop. + + @with_seed() def test_foreach(): - v3 = mx.sym.var("v3") - v4 = mx.sym.var("v4") - v5 = mx.sym.var("v5") + v3 = mx.sym.var("v0") + v4 = mx.sym.var("v1") + v5 = mx.sym.var("v2") # This tests foreach with accumulation sum. - def step(in1, states): - out = in1 + states[0] + v5 + def step1_sym(in1, states): + out = in1 * 2 + states[0] + v5 + return (out, [out]) + def step2_sym(in1, states): + out = states[0] + in1 * 2 + v5 return (out, [out]) - out = mx.sym.contrib.foreach(step, v3, [v4]) - out1 = out[0] * 2 - out = mx.sym.Group([out1, out[1][0]]) - arr1 = mx.nd.random.uniform(shape=(2, 2)) - arr2 = mx.nd.random.uniform(shape=(2)) - arr3 = mx.nd.random.uniform(shape=(2)) - arr_grad1 = mx.nd.empty(arr1.shape) - arr_grad2 = mx.nd.empty(arr2.shape) - arr_grad3 = mx.nd.empty(arr3.shape) - e = out.bind(ctx=mx.cpu(), args={'v3': arr1, 'v4': arr2, 'v5': arr3}, - args_grad={'v3': arr_grad1, 'v4': arr_grad2, 'v5': arr_grad3}) - e.forward(is_train=True) - # backward - out_grad = mx.nd.random.uniform(-10, 10, arr1.shape) - state_grad = mx.nd.random.uniform(-10, 10, arr2.shape) - e.backward([out_grad, state_grad]) - - res = [] - arr1.attach_grad() - arr2.attach_grad() - arr3.attach_grad() - with mx.autograd.record(): - for i in range(arr1.shape[0]): - if (i == 0): - tmp_res = mx.nd.expand_dims(arr2, 0) + mx.nd.expand_dims(arr1[i], 0) + mx.nd.expand_dims(arr3, 0) - else: - tmp_res = res[len(res) - 1] + mx.nd.expand_dims(arr1[i], 0) + mx.nd.expand_dims(arr3, 0) - res.append(tmp_res) - res1 = mx.nd.concat(*res, dim=0) - res2 = res1 * 2 - res = mx.nd.concat(res2, tmp_res, dim=0) - res.backward(mx.nd.concat(out_grad, mx.nd.expand_dims(state_grad, 0), dim=0)) - assert_almost_equal(e.outputs[0].asnumpy(), res2.asnumpy(), rtol=0.001, atol=0.0001) - assert_almost_equal(arr1.grad.asnumpy(), e.grad_arrays[0].asnumpy()) - assert_almost_equal(arr2.grad.asnumpy(), e.grad_arrays[1].asnumpy()) - assert_almost_equal(arr3.grad.asnumpy(), e.grad_arrays[2].asnumpy()) + def step1(data, state, free): + return data * 2 + state + free + + def step2(data, state, free): + return state + data * 2 + free + + def verify_foreach(step_sym, step, in_arrs, init_states, out_grads): + out = mx.sym.contrib.foreach(step_sym, v3, [v4]) + out1 = out[0] * 1 + out = mx.sym.Group([out1, out[1][0]]) + arr_grads = [] + arg_dict = {} + arg_grad_dict = {} + i = 0 + for arr in in_arrs: + arr_grad = mx.nd.empty(arr.shape) + arr_grads.append(arr_grad) + arg_dict['v'+str(i)] = arr + arg_grad_dict['v'+str(i)] = arr_grad + i = i + 1 + for arr in init_states: + arr_grad = mx.nd.empty(arr.shape) + arr_grads.append(arr_grad) + arg_dict['v'+str(i)] = arr + arg_grad_dict['v'+str(i)] = arr_grad + i = i + 1 + + gin_order = [] + for name in out.list_inputs(): + name = name[1:] + gin_order.append(int(name)) + + e = out.bind(ctx=mx.cpu(), args=arg_dict, args_grad=arg_grad_dict) + e.forward(is_train=True) + # backward + e.backward(out_grads) + + res = 
[] + for arr in in_arrs: + arr.attach_grad() + with mx.autograd.record(): + states = [mx.nd.expand_dims(s, 0) for s in init_states] + for i in range(in_arrs[0].shape[0]): + tmp_res = step(mx.nd.expand_dims(in_arrs[0][i], 0), + states[0], states[1]) + res.append(tmp_res) + states[0] = tmp_res + res1 = mx.nd.concat(*res, dim=0) + res2 = res1 * 1 + res = mx.nd.concat(res2, tmp_res, dim=0) + res.backward(mx.nd.concat(out_grads[0], mx.nd.expand_dims(out_grads[1], 0), dim=0)) + assert_almost_equal(e.outputs[0].asnumpy(), res2.asnumpy(), rtol=0.001, atol=0.0001) + for i in range(len(in_arrs)): + assert_almost_equal(in_arrs[i].grad.asnumpy(), e.grad_arrays[gin_order[i]].asnumpy()) + + # Test foreach with data in different locations among inputs, + # different numbers of iterations. + arrs = [mx.nd.random.uniform(shape=(2, 2))] + states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] + out_grads = [mx.nd.random.uniform(-10, 10, arrs[0].shape), + mx.nd.random.uniform(-10, 10, states[0].shape)] + verify_foreach(step1_sym, step1, arrs, states, out_grads) + + arrs = [mx.nd.random.uniform(shape=(3, 2))] + out_grads = [mx.nd.random.uniform(-10, 10, arrs[0].shape), + mx.nd.random.uniform(-10, 10, states[0].shape)] + verify_foreach(step1_sym, step1, arrs, states, out_grads) + + arrs = [mx.nd.random.uniform(shape=(2, 2))] + states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] + out_grads = [mx.nd.random.uniform(-10, 10, arrs[0].shape), + mx.nd.random.uniform(-10, 10, states[0].shape)] + verify_foreach(step2_sym, step2, arrs, states, out_grads) + + arrs = [mx.nd.random.uniform(shape=(3, 2))] + out_grads = [mx.nd.random.uniform(-10, 10, arrs[0].shape), + mx.nd.random.uniform(-10, 10, states[0].shape)] + verify_foreach(step2_sym, step2, arrs, states, out_grads) @with_seed() @@ -6081,12 +6133,6 @@ def sym_group(out): assert_almost_equal(e1.grad_arrays[i].asnumpy(), e2.grad_arrays[i].asnumpy()) -# TODO Test cases: -# in an iteration, data is stored in any location. -# # iterations: odd or even. -# multiple inputs and multiple outputs. - - @with_seed() def test_squeeze_op(): def check_squeeze_op(shape, axis=None): From d8c9b1f1670847a14dd99b41b36301d22f76102b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 11 May 2018 17:17:37 +0000 Subject: [PATCH 038/135] update tests. --- tests/python/unittest/test_operator.py | 149 ++++++++++++++++--------- 1 file changed, 98 insertions(+), 51 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 10356c648f80..a5a5fe948093 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -24,7 +24,7 @@ import itertools from numpy.testing import assert_allclose, assert_array_equal from mxnet.test_utils import * -from mxnet.base import py_str, MXNetError +from mxnet.base import py_str, MXNetError, _as_list from common import setup_module, with_seed, teardown import unittest @@ -5949,30 +5949,35 @@ def test_foreach(): v3 = mx.sym.var("v0") v4 = mx.sym.var("v1") v5 = mx.sym.var("v2") + v6 = mx.sym.var("v3") + v7 = mx.sym.var("v4") # This tests foreach with accumulation sum. 
- def step1_sym(in1, states): - out = in1 * 2 + states[0] + v5 + def step1(in1, states, free): + out = in1 * 2 + states[0] + free[0] return (out, [out]) - def step2_sym(in1, states): - out = states[0] + in1 * 2 + v5 + def step2(in1, states, free): + out = states[0] + in1 * 2 + free[0] return (out, [out]) - - def step1(data, state, free): - return data * 2 + state + free - - def step2(data, state, free): - return state + data * 2 + free - - def verify_foreach(step_sym, step, in_arrs, init_states, out_grads): - out = mx.sym.contrib.foreach(step_sym, v3, [v4]) - out1 = out[0] * 1 - out = mx.sym.Group([out1, out[1][0]]) + def step3(in1, states, free): + out = in1[0] + in1[1] + states[0] + states[1] + free[0] + return ([out, out * 2], [out * 2, out * 3]) + + def verify_foreach(step, in_syms, state_syms, free_syms, + in_arrs, init_states, frees, out_grads): + step_sym = lambda in_syms, state_syms : step(in_syms, state_syms, free_syms) + step_imp = lambda in_arrs, state_arrs : step(in_arrs, state_arrs, frees) + out = mx.sym.contrib.foreach(step_sym, in_syms, state_syms) + out1 = _as_list(out[0]) + for i in range(len(out1)): + out1[i] = out1[i] * 2 + out1.extend(out[1]) + out = mx.sym.Group(out1) arr_grads = [] arg_dict = {} arg_grad_dict = {} i = 0 - for arr in in_arrs: + for arr in _as_list(in_arrs): arr_grad = mx.nd.empty(arr.shape) arr_grads.append(arr_grad) arg_dict['v'+str(i)] = arr @@ -5984,6 +5989,12 @@ def verify_foreach(step_sym, step, in_arrs, init_states, out_grads): arg_dict['v'+str(i)] = arr arg_grad_dict['v'+str(i)] = arr_grad i = i + 1 + for arr in frees: + arr_grad = mx.nd.empty(arr.shape) + arr_grads.append(arr_grad) + arg_dict['v'+str(i)] = arr + arg_grad_dict['v'+str(i)] = arr_grad + i = i + 1 gin_order = [] for name in out.list_inputs(): @@ -5993,49 +6004,85 @@ def verify_foreach(step_sym, step, in_arrs, init_states, out_grads): e = out.bind(ctx=mx.cpu(), args=arg_dict, args_grad=arg_grad_dict) e.forward(is_train=True) # backward - e.backward(out_grads) + tmp_grads = out_grads[0][:] + tmp_grads.extend(out_grads[1]) + e.backward(tmp_grads) + # Below we use imperative to reimplement foreach and compute its gradients. 
res = [] - for arr in in_arrs: + for i in range(len(_as_list(out_grads[0]))): + res.append([]) + for arr in _as_list(in_arrs): + arr.attach_grad() + for arr in init_states: + arr.attach_grad() + for arr in frees: arr.attach_grad() with mx.autograd.record(): states = [mx.nd.expand_dims(s, 0) for s in init_states] - for i in range(in_arrs[0].shape[0]): - tmp_res = step(mx.nd.expand_dims(in_arrs[0][i], 0), - states[0], states[1]) - res.append(tmp_res) - states[0] = tmp_res - res1 = mx.nd.concat(*res, dim=0) - res2 = res1 * 1 - res = mx.nd.concat(res2, tmp_res, dim=0) - res.backward(mx.nd.concat(out_grads[0], mx.nd.expand_dims(out_grads[1], 0), dim=0)) - assert_almost_equal(e.outputs[0].asnumpy(), res2.asnumpy(), rtol=0.001, atol=0.0001) - for i in range(len(in_arrs)): - assert_almost_equal(in_arrs[i].grad.asnumpy(), e.grad_arrays[gin_order[i]].asnumpy()) + if isinstance(in_arrs, list): + num_iters = in_arrs[0].shape[0] + else: + num_iters = in_arrs.shape[0] + + for i in range(num_iters): + if isinstance(in_arrs, list): + data = [mx.nd.expand_dims(arr[i], 0) for arr in in_arrs] + else: + data = mx.nd.expand_dims(in_arrs[i], 0) + tmp_res = step_imp(data, states) + tmp_res1 = _as_list(tmp_res[0]) + for i in range(len(tmp_res1)): + res[i].append(tmp_res1[i]) + states = tmp_res[1] + res2 = [] + for l in res: + res2.append(mx.nd.concat(*l, dim=0) * 2) + tmp_res2 = res2[:] + tmp_res2.extend(tmp_res[1]) + res = mx.nd.concat(*tmp_res2, dim=0) + + tmp_grads = out_grads[0][:] + tmp_grads1 = [mx.nd.expand_dims(grad, 0) for grad in out_grads[1]] + tmp_grads.extend(tmp_grads1) + res.backward(mx.nd.concat(*tmp_grads, dim=0)) + for i in range(len(res2)): + assert_almost_equal(e.outputs[i].asnumpy(), res2[i].asnumpy(), rtol=0.001, atol=0.0001) + all_ins = _as_list(in_arrs) + all_ins.extend(init_states) + all_ins.extend(frees) + for i in range(len(all_ins)): + assert_almost_equal(all_ins[i].grad.asnumpy(), e.grad_arrays[gin_order[i]].asnumpy()) # Test foreach with data in different locations among inputs, # different numbers of iterations. 
- arrs = [mx.nd.random.uniform(shape=(2, 2))] - states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] - out_grads = [mx.nd.random.uniform(-10, 10, arrs[0].shape), - mx.nd.random.uniform(-10, 10, states[0].shape)] - verify_foreach(step1_sym, step1, arrs, states, out_grads) - - arrs = [mx.nd.random.uniform(shape=(3, 2))] - out_grads = [mx.nd.random.uniform(-10, 10, arrs[0].shape), - mx.nd.random.uniform(-10, 10, states[0].shape)] - verify_foreach(step1_sym, step1, arrs, states, out_grads) - - arrs = [mx.nd.random.uniform(shape=(2, 2))] + states = [mx.nd.random.uniform(shape=(2))] + frees = [mx.nd.random.uniform(shape=(2))] + arrs = mx.nd.random.uniform(shape=(2, 2)) + out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], + [mx.nd.random.uniform(-10, 10, states[0].shape)]] + verify_foreach(step1, v3, [v4], [v5], arrs, states, frees, out_grads) + + arrs = mx.nd.random.uniform(shape=(3, 2)) + out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], + [mx.nd.random.uniform(-10, 10, states[0].shape)]] + verify_foreach(step1, v3, [v4], [v5], arrs, states, frees, out_grads) + + arrs = mx.nd.random.uniform(shape=(2, 2)) + out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], + [mx.nd.random.uniform(-10, 10, states[0].shape)]] + verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads) + + arrs = mx.nd.random.uniform(shape=(3, 2)) + out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], + [mx.nd.random.uniform(-10, 10, states[0].shape)]] + verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads) + + arrs = [mx.nd.random.uniform(shape=(3, 2)), mx.nd.random.uniform(shape=(3, 2))] states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] - out_grads = [mx.nd.random.uniform(-10, 10, arrs[0].shape), - mx.nd.random.uniform(-10, 10, states[0].shape)] - verify_foreach(step2_sym, step2, arrs, states, out_grads) - - arrs = [mx.nd.random.uniform(shape=(3, 2))] - out_grads = [mx.nd.random.uniform(-10, 10, arrs[0].shape), - mx.nd.random.uniform(-10, 10, states[0].shape)] - verify_foreach(step2_sym, step2, arrs, states, out_grads) + out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape), mx.nd.random.uniform(-10, 10, arrs[1].shape)], + [mx.nd.random.uniform(-10, 10, states[0].shape), mx.nd.random.uniform(-10, 10, states[1].shape)]] + verify_foreach(step3, [v3, v4], [v5, v6], [v7], arrs, states, frees, out_grads) @with_seed() From 32e3b17b9684d7ca67cd85d197d30ad30d455e47 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 12 May 2018 01:28:35 +0000 Subject: [PATCH 039/135] check state shape. --- src/operator/nn/control_flow.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 6fe3675dfa5d..dfb00ac79c40 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -157,6 +157,8 @@ void ForeachState::Forward(std::vector cinputs, arg_names, params); // TODO here we only changed the output arrays in the arguments. // Will this be a problem? + // TODO we need to avoid shape inference and memory plan whenever the op is + // called. 
op->Forward(nullptr, inputs, outputs); if (is_recording) { @@ -442,6 +444,11 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, uint32_t eid = idx.entry_id(g->outputs[i]); (*out_shape)[i] = shapes[eid]; } + size_t num_states = g->outputs.size() - params.num_out_data; + for (size_t i = 0; i < num_states; i++) { + size_t loc = params.in_state_locs[i]; + CHECK((*out_shape)[i + params.num_out_data] == (*in_shape)[loc]); + } return true; } From 8caf708fd3afa48ea5ec1fd94e69ff1d307b884f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 14 May 2018 23:39:04 +0000 Subject: [PATCH 040/135] enable nested foreach. --- src/imperative/imperative_utils.h | 18 ++++- tests/python/unittest/test_operator.py | 101 ++++++++++++++++++++----- 2 files changed, 94 insertions(+), 25 deletions(-) diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 691f0822b082..479e6b62fbed 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -478,12 +478,18 @@ inline void PushOperator(const OpStatePtr& state, InvalidateOutputs(outputs, req); #endif fcompute_ex(state, opctx, inputs, req, outputs); - if (ctx.dev_mask() == gpu::kDevMask && exec_type == ExecType::kSync) { + if (ctx.dev_mask() == gpu::kDevMask && exec_type == ExecType::kSync + && rctx.get_stream()) { rctx.get_stream()->Wait(); } }; - if (exec_type == ExecType::kSync) { + // For operators with subgraphs, we need to invoke them in the main thread + // instead of the threaded engine. + if (!attrs.subgraphs.empty()) { + RunContext rctx{ctx, nullptr}; + run(rctx, engine::CallbackOnComplete()); + } else if (exec_type == ExecType::kSync) { Engine::Get()->PushSync( [=](RunContext rctx) { run(rctx, engine::CallbackOnComplete()); }, ctx, read_vars, write_vars, FnProperty::kNormal, 0, @@ -523,12 +529,16 @@ inline void PushOperator(const OpStatePtr& state, fcompute(state, opctx, input_blobs, tmp_req, output_blobs); // post-fcompute fallback, cast to original storage type, if necessary CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx, is_gpu); - if (is_gpu && exec_type == ExecType::kSync) { + if (is_gpu && exec_type == ExecType::kSync + && rctx.get_stream()) { rctx.get_stream()->Wait(); } }; - if (exec_type == ExecType::kSync) { + if (!attrs.subgraphs.empty()) { + RunContext rctx{ctx, nullptr}; + run(rctx, engine::CallbackOnComplete()); + } else if (exec_type == ExecType::kSync) { Engine::Get()->PushSync( [=](RunContext rctx) { run(rctx, engine::CallbackOnComplete()); diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index a5a5fe948093..a71ffdf95698 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5937,13 +5937,6 @@ def test_float16_min_max(): assert np.finfo('float16').max == mx.nd.max(a).asscalar() -# TODO Test cases: -# in an iteration, data is stored in any location. -# # iterations: odd or even. -# multiple inputs and multiple outputs. -# test nested loop. 
- - @with_seed() def test_foreach(): v3 = mx.sym.var("v0") @@ -5964,7 +5957,7 @@ def step3(in1, states, free): return ([out, out * 2], [out * 2, out * 3]) def verify_foreach(step, in_syms, state_syms, free_syms, - in_arrs, init_states, frees, out_grads): + in_arrs, init_states, frees, out_grads, is_train=True): step_sym = lambda in_syms, state_syms : step(in_syms, state_syms, free_syms) step_imp = lambda in_arrs, state_arrs : step(in_arrs, state_arrs, frees) out = mx.sym.contrib.foreach(step_sym, in_syms, state_syms) @@ -6001,12 +5994,14 @@ def verify_foreach(step, in_syms, state_syms, free_syms, name = name[1:] gin_order.append(int(name)) - e = out.bind(ctx=mx.cpu(), args=arg_dict, args_grad=arg_grad_dict) - e.forward(is_train=True) - # backward - tmp_grads = out_grads[0][:] - tmp_grads.extend(out_grads[1]) - e.backward(tmp_grads) + e = out.bind(ctx=mx.cpu(), args=arg_dict, args_grad=arg_grad_dict, + ) + e.forward(is_train=is_train) + if (is_train): + # backward + tmp_grads = out_grads[0][:] + tmp_grads.extend(out_grads[1]) + e.backward(tmp_grads) # Below we use imperative to reimplement foreach and compute its gradients. res = [] @@ -6045,14 +6040,26 @@ def verify_foreach(step, in_syms, state_syms, free_syms, tmp_grads = out_grads[0][:] tmp_grads1 = [mx.nd.expand_dims(grad, 0) for grad in out_grads[1]] tmp_grads.extend(tmp_grads1) - res.backward(mx.nd.concat(*tmp_grads, dim=0)) + if (is_train): + res.backward(mx.nd.concat(*tmp_grads, dim=0)) for i in range(len(res2)): - assert_almost_equal(e.outputs[i].asnumpy(), res2[i].asnumpy(), rtol=0.001, atol=0.0001) - all_ins = _as_list(in_arrs) - all_ins.extend(init_states) - all_ins.extend(frees) - for i in range(len(all_ins)): - assert_almost_equal(all_ins[i].grad.asnumpy(), e.grad_arrays[gin_order[i]].asnumpy()) + assert_almost_equal(e.outputs[i].asnumpy(), res2[i].asnumpy(), + rtol=0.001, atol=0.0001) + if (is_train): + all_ins = _as_list(in_arrs) + all_ins.extend(init_states) + all_ins.extend(frees) + for i in range(len(all_ins)): + assert_almost_equal(all_ins[i].grad.asnumpy(), + e.grad_arrays[gin_order[i]].asnumpy()) + + # Test cases: + # * graph inputs are stored in different orders. + # This is to test if foreach finds the data arrays and weight arrays + # in the right location. + # * the number of iterations: odd or even. + # * multiple inputs and multiple outputs. + # * inference. # Test foreach with data in different locations among inputs, # different numbers of iterations. 
@@ -6062,27 +6069,79 @@ def verify_foreach(step, in_syms, state_syms, free_syms, out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], [mx.nd.random.uniform(-10, 10, states[0].shape)]] verify_foreach(step1, v3, [v4], [v5], arrs, states, frees, out_grads) + verify_foreach(step1, v3, [v4], [v5], arrs, states, frees, out_grads, False) arrs = mx.nd.random.uniform(shape=(3, 2)) out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], [mx.nd.random.uniform(-10, 10, states[0].shape)]] verify_foreach(step1, v3, [v4], [v5], arrs, states, frees, out_grads) + verify_foreach(step1, v3, [v4], [v5], arrs, states, frees, out_grads, False) arrs = mx.nd.random.uniform(shape=(2, 2)) out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], [mx.nd.random.uniform(-10, 10, states[0].shape)]] verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads) + verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads, False) arrs = mx.nd.random.uniform(shape=(3, 2)) out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], [mx.nd.random.uniform(-10, 10, states[0].shape)]] verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads) + verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads, False) arrs = [mx.nd.random.uniform(shape=(3, 2)), mx.nd.random.uniform(shape=(3, 2))] states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape), mx.nd.random.uniform(-10, 10, arrs[1].shape)], [mx.nd.random.uniform(-10, 10, states[0].shape), mx.nd.random.uniform(-10, 10, states[1].shape)]] verify_foreach(step3, [v3, v4], [v5, v6], [v7], arrs, states, frees, out_grads) + verify_foreach(step3, [v3, v4], [v5, v6], [v7], arrs, states, frees, out_grads, False) + + +@with_seed() +def test_foreach_nested(): + # Test nested foreach. + def step_in(in1, states): + out = in1 * 2 + states[0] + return (out, [out]) + + def step(in1, states): + out1 = mx.sym.contrib.foreach(step_in, in1, states) + out = mx.sym.broadcast_add(out1[0], states[0]) + return (out, [mx.sym.squeeze(mx.sym.slice(out, begin=(0, 0), end=(1, 2)))]) + + data_sym = mx.sym.var("v1") + state_sym = mx.sym.var("v2") + out = mx.sym.contrib.foreach(step, data_sym, [state_sym]) + + out1 = _as_list(out[0]) + for i in range(len(out1)): + out1[i] = out1[i] + out1.extend(out[1]) + out = mx.sym.Group(out1) + + data = mx.nd.arange(4).reshape((1, 2, 2)) + state = mx.nd.arange(2) + data_grad = mx.nd.empty(data.shape) + state_grad = mx.nd.empty(state.shape) + e = out.bind(ctx=mx.cpu(), args={'v1':data, 'v2':state}, + args_grad={'v1':data_grad, 'v2':state_grad}) + e.forward(is_train=True) + out = mx.nd.zeros_like(data) + for i in range(data.shape[0]): + data1 = data[i] + out1 = mx.nd.zeros_like(data1) + for j in range(data1.shape[0]): + if (j > 0): + out1[j] = out1[j-1] + data1[j] * 2 + else: + out1[j] = data1[j] * 2 + state + if (i > 0): + state = mx.nd.squeeze(mx.nd.slice(out[i-1], begin=(0, 0), end=(1, 2))) + out[i] = mx.nd.broadcast_add(out1, state) + else: + out[i] = mx.nd.broadcast_add(out1, state) + out = out + assert_almost_equal(out.asnumpy(), e.outputs[0].asnumpy(), rtol=0.001, atol=0.0001) @with_seed() From d4ef381020c6e7867e64e46959afe321a5c16842 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 16 May 2018 23:09:33 +0000 Subject: [PATCH 041/135] remove print. 
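
The nested-loop support enabled in the patches above allows a foreach to appear inside the step body of another foreach. A condensed sketch of the pattern from test_foreach_nested above, assuming the contrib.foreach API at this point in the series (variable names and shapes are illustrative):

    import mxnet as mx

    def step_in(in1, states):
        out = in1 * 2 + states[0]
        return (out, [out])

    def step_out(in1, states):
        inner = mx.sym.contrib.foreach(step_in, in1, states)  # inner loop per outer iteration
        out = mx.sym.broadcast_add(inner[0], states[0])
        return (out, [mx.sym.squeeze(mx.sym.slice(out, begin=(0, 0), end=(1, 2)))])

    data = mx.sym.var("data")    # assumed shape (outer_len, inner_len, 2)
    state = mx.sym.var("state")  # assumed shape (2,)
    out = mx.sym.contrib.foreach(step_out, data, [state])
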
--- src/c_api/c_api_symbolic.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 18e03544caf1..fd2dd7129bc9 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -362,7 +362,6 @@ int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **out_arr, int *out_s nnvm::Symbol *s = new nnvm::Symbol(); s->outputs.push_back(e); input_syms.push_back(s); - std::cout << p->attrs.name << std::endl; } } } From 427003235edab4c9871c025a06f8e811c7993518 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 17 May 2018 14:32:25 +0000 Subject: [PATCH 042/135] fix a bug in test. --- tests/python/unittest/test_operator.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index a71ffdf95698..e69f238fbbfd 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -6046,7 +6046,7 @@ def verify_foreach(step, in_syms, state_syms, free_syms, assert_almost_equal(e.outputs[i].asnumpy(), res2[i].asnumpy(), rtol=0.001, atol=0.0001) if (is_train): - all_ins = _as_list(in_arrs) + all_ins = _as_list(in_arrs)[:] all_ins.extend(init_states) all_ins.extend(frees) for i in range(len(all_ins)): @@ -6061,8 +6061,6 @@ def verify_foreach(step, in_syms, state_syms, free_syms, # * multiple inputs and multiple outputs. # * inference. - # Test foreach with data in different locations among inputs, - # different numbers of iterations. states = [mx.nd.random.uniform(shape=(2))] frees = [mx.nd.random.uniform(shape=(2))] arrs = mx.nd.random.uniform(shape=(2, 2)) @@ -6089,6 +6087,7 @@ def verify_foreach(step, in_syms, state_syms, free_syms, verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads) verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads, False) + # Test multiple inputs and outputs. arrs = [mx.nd.random.uniform(shape=(3, 2)), mx.nd.random.uniform(shape=(3, 2))] states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape), mx.nd.random.uniform(-10, 10, arrs[1].shape)], From b54f234e8ce5d7a8b42ff93abefbb0872eb1fb6e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 18 May 2018 06:18:28 +0000 Subject: [PATCH 043/135] handle infer storage type for backward. --- src/operator/nn/control_flow.cc | 96 +++++++++++++++++++++++++++------ 1 file changed, 81 insertions(+), 15 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index dfb00ac79c40..87fb48cfb749 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -157,8 +157,10 @@ void ForeachState::Forward(std::vector cinputs, arg_names, params); // TODO here we only changed the output arrays in the arguments. // Will this be a problem? - // TODO we need to avoid shape inference and memory plan whenever the op is - // called. + // TODO(zhengda) we need to avoid shape inference and memory plan whenever the op is + // called. Currently, CachedOp allocates memory each time Forward is called. + // I need to fix this once the PR for static memory allocation in CachedOp is + // merged. 
https://github.com/apache/incubator-mxnet/pull/10817 op->Forward(nullptr, inputs, outputs); if (is_recording) { @@ -410,7 +412,6 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(attrs.subgraphs.size(), 1U); auto g = std::make_shared(); g->outputs = attrs.subgraphs[0]->outputs; - // TODO(zhengda) We should avoid creating an index graph so many times. const auto& idx = g->indexed_graph(); CHECK_EQ(idx.input_nodes().size(), in_shape->size()); CHECK_EQ(idx.outputs().size(), out_shape->size()); @@ -460,7 +461,6 @@ static bool ForeachType(const nnvm::NodeAttrs& attrs, CHECK_EQ(attrs.subgraphs.size(), 1U); auto g = std::make_shared(); g->outputs = attrs.subgraphs[0]->outputs; - // TODO(zhengda) We should avoid creating an index graph so many times. const auto& idx = g->indexed_graph(); CHECK_EQ(idx.input_nodes().size(), in_type->size()); CHECK_EQ(idx.outputs().size(), out_type->size()); @@ -492,7 +492,6 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(attrs.subgraphs.size(), 1U); auto g = std::make_shared(); g->outputs = attrs.subgraphs[0]->outputs; - // TODO(zhengda) We should avoid creating an index graph so many times. const auto& idx = g->indexed_graph(); CHECK_EQ(idx.input_nodes().size(), in_attrs->size()); CHECK_EQ(idx.outputs().size(), out_attrs->size()); @@ -525,9 +524,69 @@ static bool BackwardForeachStorageType(const nnvm::NodeAttrs& attrs, DispatchMode* dispatch_mode, std::vector *in_attrs, std::vector *out_attrs) { - // TODO I need to set storage type properly. - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, DispatchMode::kFComputeEx); + using namespace nnvm; + const ForeachParam& params = nnvm::get(attrs.parsed); + CHECK_EQ(out_attrs->size(), (size_t) params.num_args - 1); + CHECK_EQ(attrs.subgraphs.size(), 1U); + // construct backward graph + nnvm::Graph grad_graph; + nnvm::Graph fwd_graph; + std::vector potential_nodes; + { + fwd_graph.outputs = attrs.subgraphs[0]->outputs; + std::vector ograd_entries; + ograd_entries.reserve(fwd_graph.outputs.size()); + for (size_t i = 0; i < fwd_graph.outputs.size(); ++i) { + ograd_entries.emplace_back(NodeEntry{Node::Create(), 0, 0}); + } + nnvm::Symbol subgraph_sym = *attrs.subgraphs[0]; + + std::vector xs; + std::vector args = subgraph_sym.ListInputs(nnvm::Symbol::kReadOnlyArgs); + xs.reserve(args.size()); + for (const auto& i : args) + xs.emplace_back(NodeEntry{i, 0, 0}); + CHECK_GT(xs.size(), 0) + << "There are no inputs in computation graph that require gradients."; + + static const std::vector zero_ops{Op::Get("zeros_like"), Op::Get("_zeros")}; + grad_graph = pass::Gradient( + fwd_graph, fwd_graph.outputs, xs, ograd_entries, + exec::AggregateGradient, nullptr, nullptr, + zero_ops, "_copy"); + potential_nodes.reserve(fwd_graph.outputs.size() + xs.size() + ograd_entries.size()); + for (auto e : ograd_entries) + potential_nodes.push_back(e.node.get()); + for (auto e : xs) + potential_nodes.push_back(e.node.get()); + for (auto e : fwd_graph.outputs) + potential_nodes.push_back(e.node.get()); + } + + const auto& idx = grad_graph.indexed_graph(); + auto input_nodes = idx.input_nodes(); + StorageTypeVector storage_type_inputs(input_nodes.size()); + for (size_t i = 0; i < input_nodes.size(); i++) { + auto node_id = input_nodes[i]; + const nnvm::IndexedGraph::Node &n = idx[node_id]; + auto it = std::find(potential_nodes.begin(), potential_nodes.end(), n.source); + CHECK(it != potential_nodes.end()); + size_t idx = it - potential_nodes.begin(); + CHECK_LT(idx, 
in_attrs->size()); + storage_type_inputs[i] = in_attrs->at(idx); + } + CHECK_EQ(idx.outputs().size(), out_attrs->size()); + exec::DevMaskVector dev_masks(idx.num_nodes(), dev_mask); + imperative::CheckAndInferStorageType(&grad_graph, std::move(dev_masks), + std::move(storage_type_inputs), true); + + const auto& stypes = grad_graph.GetAttr("storage_type"); + *dispatch_mode = DispatchMode::kFComputeEx; + auto &outputs = idx.outputs(); + CHECK(outputs.size() == out_attrs->size()); + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = stypes[idx.entry_id(outputs[i])]; + return true; } static OpStatePtr CreateForeachState(const NodeAttrs& attrs, @@ -538,10 +597,12 @@ static OpStatePtr CreateForeachState(const NodeAttrs& attrs, return OpStatePtr::Create(*attrs.subgraphs[0], params); } -void ForeachParamParser(nnvm::NodeAttrs* attrs) { - ParamParser(attrs); - // This is to indicate that the operator has a subgraph. - attrs->subgraphs.resize(1); +static std::vector +ForeachGradient(const nnvm::NodePtr& n, const std::vector& ograds) { + ElemwiseGradUseInOut fgrad{"_backward_foreach"}; + std::vector entries = fgrad(n, ograds); + entries[0].node->attrs.subgraphs = n->attrs.subgraphs; + return entries; } NNVM_REGISTER_OP(_foreach) @@ -558,13 +619,18 @@ NNVM_REGISTER_OP(_foreach) }) .set_attr("FListInputNames", [](const NodeAttrs& attrs) { - return std::vector{"fn", "data1", "data2"}; + const ForeachParam& params = nnvm::get(attrs.parsed); + std::vector names; + names.push_back("fn"); + for (int i = 0; i < params.num_args - 1; i++) + names.push_back("data" + std::to_string(i)); + return names; }) .set_attr("FInputGraph", [](const NodeAttrs& attrs) { return std::vector{0}; }) -.set_attr("FGradient", ElemwiseGradUseInOut{"_backward_foreach"}) +.set_attr("FGradient", ForeachGradient) .set_attr("FCreateOpState", CreateForeachState) .set_attr("FInferShape", ForeachShape) .set_attr("FInferType", ForeachType) @@ -585,7 +651,7 @@ NNVM_REGISTER_OP(_backward_foreach) return params.num_args - 1; }) .set_attr("FInferStorageType", BackwardForeachStorageType) -.set_attr_parser(ForeachParamParser) +.set_attr_parser(ParamParser) .set_attr("TIsLayerOpBackward", true) .set_attr("TIsBackward", true) .set_attr("FStatefulComputeEx", ForeachGradComputeExCPU); From 14d319b3c2d4bcbce755fe36805eafff3428a78e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 18 May 2018 06:31:48 +0000 Subject: [PATCH 044/135] address comments. --- python/mxnet/symbol/contrib.py | 31 ++++++++++++++++++------------- src/c_api/c_api_symbolic.cc | 2 +- src/executor/exec_pass.h | 4 +--- src/operator/nn/control_flow.cc | 4 ++-- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 1d82cdc92763..c236dffa63cd 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -111,14 +111,19 @@ def _get_graph_inputs(subg, name, prefix): syms.append(s) return syms -def foreach(func, input, init_states, back_prop=False, name="foreach"): +def foreach(func, data, init_states, back_prop=False, name="foreach"): assert isinstance(init_states, list), "init_states should be a list" states = [] + + # TODO(zhengda) If the input python function references to the symbols outside + # the python function, we need to prune the computation graph constructed from + # the function. One way of doing it is to mark the nodes in the computation graph + # with AttrScope and prune the nodes without the special attribute. 
with AttrScope(subgraph_name=name): - if isinstance(input, list): - in_eles = [symbol.var(sym.name) for sym in input] + if isinstance(data, list): + in_eles = [symbol.var(sym.name) for sym in data] else: - in_eles = symbol.var(input.name) + in_eles = symbol.var(data.name) for s in init_states: states.append(symbol.var(s.name)) @@ -132,21 +137,21 @@ def foreach(func, input, init_states, back_prop=False, name="foreach"): "the number of output states (%d) should be the same as input states (%d)" \ % (len(sym_out[1]), len(init_states)) - if (isinstance(sym_out[0], list)): + if isinstance(sym_out[0], list): flat_out = sym_out[0] else: flat_out = [sym_out[0]] num_out_data = len(flat_out) for s in sym_out[1]: # There is a problem if the outputs are the same as the inputs - # or the first output. - # TODO this is a temp fix. + # or the first output. By calling identity, we can make sure that + # all symbols will refer to different NDArrays. flat_out.append(symbol.op.identity(s)) g = symbol.Group(flat_out) input_syms = _get_graph_inputs(g, name, "ro_var") - if (isinstance(input, list)): - num_inputs = len(input) + if isinstance(data, list): + num_inputs = len(data) else: num_inputs = 1 @@ -161,7 +166,7 @@ def foreach(func, input, init_states, back_prop=False, name="foreach"): ordered_ins = [] states_map = {sym.name:sym for sym in init_states} state_names = states_map.keys() - data_syms = _as_list(input) + data_syms = _as_list(data) data_map = {sym.name:sym for sym in data_syms} data_names = data_map.keys() in_state_locs = [] @@ -169,10 +174,10 @@ def foreach(func, input, init_states, back_prop=False, name="foreach"): for in_name in g.list_inputs(): assert in_name in gin_names, "The input variable %s can't be found in graph inputs: %s" \ % (in_name, str(gin_names)) - if (in_name in state_names): + if in_name in state_names: ordered_ins.append(states_map[in_name]) in_state_locs.append(len(ordered_ins) - 1) - elif (in_name in data_names): + elif in_name in data_names: ordered_ins.append(data_map[in_name]) in_data_locs.append(len(ordered_ins) - 1) else: @@ -183,7 +188,7 @@ def foreach(func, input, init_states, back_prop=False, name="foreach"): ret = symbol._internal._foreach(g, *ordered_ins, num_outputs=num_outputs, num_out_data=num_out_data, in_state_locs=in_state_locs, in_data_locs=in_data_locs) - if (num_outputs - num_states > 1): + if num_outputs - num_states > 1: outs = [] for i in range(num_outputs - num_states): outs.append(ret[i]) diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index fd2dd7129bc9..0829a25c3e7c 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -368,7 +368,7 @@ int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **out_arr, int *out_s CHECK(input_syms.size() <= max_out_size); *out_size = input_syms.size(); memcpy(out_arr, input_syms.data(), sizeof(*out_arr) * input_syms.size()); - API_END(); + API_END_HANDLE_ERROR(); } int MXSymbolCreateFromFile(const char *fname, SymbolHandle *out) { diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index 6f4a5611c1bf..3cc7e73c39d0 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -64,9 +64,7 @@ class OpExecutor { OpContext op_ctx; /*! \brief virtual destructor */ virtual ~OpExecutor() {} - virtual bool HasSubgraph() const { - return false; - } + virtual bool HasSubgraph() const = 0; /*! * \brief Setup the executor for given NDArray member * this can be called multiple times if NDArray changed during reshape. 
diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 87fb48cfb749..a454fb32d296 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -328,7 +328,7 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, std::unordered_set in_state_locs(params.in_state_locs.begin(), params.in_state_locs.end()); // The inputs contain out gradients, inputs and outputs. - size_t len = inputs[0].shape()[iter_dim]; + int len = inputs[0].shape()[iter_dim]; size_t num_output_data = params.num_out_data; // In backward computation, we need to run iterations from backwards. @@ -637,7 +637,7 @@ NNVM_REGISTER_OP(_foreach) .set_attr("FStatefulComputeEx", ForeachComputeExCPU) .set_attr("key_var_num_args", "num_args") .add_argument("fn", "Symbol", "Input graph.") -.add_argument("inputs", "NDArray-or-Symbol[]", +.add_argument("data", "NDArray-or-Symbol[]", "The input arrays that include data arrays and states.") .add_arguments(ForeachParam::__FIELDS__()); From 0e666a9151cf9e0a5d0e22602b4c4fadeb9dbc64 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 18 May 2018 06:34:26 +0000 Subject: [PATCH 045/135] address comments. --- src/operator/nn/control_flow.cc | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index a454fb32d296..60e7d5be55b6 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -56,26 +56,6 @@ struct ForeachParam : public dmlc::Parameter { DMLC_REGISTER_PARAMETER(ForeachParam); -// The input arguments are ordered in the following order: -// in, state0, state1, ... -// We need to reorder them in the same order as the input nodes of the subgraph. -template -static std::vector ReorderInputs(const std::vector &in, const nnvm::IndexedGraph& idx) { - std::vector ret(in.size()); - CHECK_EQ(idx.input_nodes().size(), in.size()); - for (size_t i = 0; i < idx.input_nodes().size(); i++) { - std::string name = idx[idx.input_nodes()[i]].source->attrs.name; - if (name == "in") { - ret[i] = in[0]; - } else { - auto idx_str = name.substr(5); - int idx = std::stoi(idx_str); - ret[i] = in[idx + 1]; - } - } - return ret; -} - class ForeachState { // These are output arrays from all iterations. // They also contain the Op state for each CachedOp. From 255c478918a8dbc8697aaa60d9bfe40cd8512de0 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 18 May 2018 16:51:21 +0000 Subject: [PATCH 046/135] move some common functions out. 
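
The inference helpers factored out in this patch run shape, type and storage inference on the subgraph and copy the inferred attributes back to the outer graph. From the Python side this means shapes should propagate through a foreach symbol as usual; a minimal sketch, assuming the contrib.foreach API above (variable names and shapes are illustrative):

    import mxnet as mx

    data = mx.sym.var("d")
    state = mx.sym.var("s")
    res = mx.sym.contrib.foreach(lambda x, st: (x + st[0], [x + st[0]]), data, [state])
    g = mx.sym.Group([res[0], res[1][0]])
    # Expected from the shape logic above: scanned outputs (3, 2), final state (2,).
    arg_shapes, out_shapes, _ = g.infer_shape(d=(3, 2), s=(2,))
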
--- src/operator/nn/control_flow.cc | 115 +------------------ src/operator/nn/subgraph_op_common.cc | 154 ++++++++++++++++++++++++++ src/operator/nn/subgraph_op_common.h | 46 ++++++++ 3 files changed, 206 insertions(+), 109 deletions(-) create mode 100644 src/operator/nn/subgraph_op_common.cc create mode 100644 src/operator/nn/subgraph_op_common.h diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 60e7d5be55b6..ab3e4118cdb0 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -27,6 +27,7 @@ #include "../operator_common.h" #include "../elemwise_op_common.h" #include "../../imperative/imperative_utils.h" +#include "./subgraph_op_common.h" namespace mxnet { namespace op { @@ -437,29 +438,8 @@ static bool ForeachType(const nnvm::NodeAttrs& attrs, std::vector *in_type, std::vector *out_type) { const ForeachParam& params = nnvm::get(attrs.parsed); CHECK_EQ(out_type->size(), (size_t) params.num_outputs); - nnvm::DTypeVector dtype_inputs = *in_type; CHECK_EQ(attrs.subgraphs.size(), 1U); - auto g = std::make_shared(); - g->outputs = attrs.subgraphs[0]->outputs; - const auto& idx = g->indexed_graph(); - CHECK_EQ(idx.input_nodes().size(), in_type->size()); - CHECK_EQ(idx.outputs().size(), out_type->size()); - imperative::CheckAndInferType(g.get(), std::move(dtype_inputs), true); - - const auto &dtypes = g->GetAttr("dtype"); - - // Inferring the data type in the subgraph may infer the data type of the inputs. - // We need to copy the inferred input data types back. - const auto &input_nids = idx.input_nodes(); - CHECK_EQ(input_nids.size(), in_type->size()); - for (size_t i = 0; i < in_type->size(); i++) { - auto eid = idx.entry_id(input_nids[i], 0); - (*in_type)[i] = dtypes[eid]; - } - - for (size_t i = 0; i < g->outputs.size(); i++) - (*out_type)[i] = dtypes[idx.entry_id(g->outputs[i])]; - return true; + return InferSubgraphDataType(*attrs.subgraphs[0], in_type, out_type); } static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, @@ -470,33 +450,8 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, const ForeachParam& params = nnvm::get(attrs.parsed); CHECK_EQ(out_attrs->size(), (size_t) params.num_outputs); CHECK_EQ(attrs.subgraphs.size(), 1U); - auto g = std::make_shared(); - g->outputs = attrs.subgraphs[0]->outputs; - const auto& idx = g->indexed_graph(); - CHECK_EQ(idx.input_nodes().size(), in_attrs->size()); - CHECK_EQ(idx.outputs().size(), out_attrs->size()); - exec::DevMaskVector dev_masks(idx.num_nodes(), dev_mask); - StorageTypeVector storage_type_inputs = *in_attrs; - imperative::CheckAndInferStorageType(g.get(), std::move(dev_masks), - std::move(storage_type_inputs), true); - - const auto& stypes = g->GetAttr("storage_type"); - - // Inferring the storage in the subgraph may infer the storage of the inputs. - // We need to copy the inferred input storage back. 
- const auto &input_nids = idx.input_nodes(); - CHECK_EQ(input_nids.size(), in_attrs->size()); - for (size_t i = 0; i < in_attrs->size(); i++) { - auto eid = idx.entry_id(input_nids[i], 0); - (*in_attrs)[i] = stypes[eid]; - } - - *dispatch_mode = DispatchMode::kFComputeEx; - auto &outputs = idx.outputs(); - CHECK(outputs.size() == out_attrs->size()); - for (size_t i = 0; i < out_attrs->size(); i++) - (*out_attrs)[i] = stypes[idx.entry_id(outputs[i])]; - return true; + return InferSubgraphStorage(*attrs.subgraphs[0], dev_mask, + dispatch_mode, in_attrs, out_attrs); } static bool BackwardForeachStorageType(const nnvm::NodeAttrs& attrs, @@ -504,69 +459,11 @@ static bool BackwardForeachStorageType(const nnvm::NodeAttrs& attrs, DispatchMode* dispatch_mode, std::vector *in_attrs, std::vector *out_attrs) { - using namespace nnvm; const ForeachParam& params = nnvm::get(attrs.parsed); CHECK_EQ(out_attrs->size(), (size_t) params.num_args - 1); CHECK_EQ(attrs.subgraphs.size(), 1U); - // construct backward graph - nnvm::Graph grad_graph; - nnvm::Graph fwd_graph; - std::vector potential_nodes; - { - fwd_graph.outputs = attrs.subgraphs[0]->outputs; - std::vector ograd_entries; - ograd_entries.reserve(fwd_graph.outputs.size()); - for (size_t i = 0; i < fwd_graph.outputs.size(); ++i) { - ograd_entries.emplace_back(NodeEntry{Node::Create(), 0, 0}); - } - nnvm::Symbol subgraph_sym = *attrs.subgraphs[0]; - - std::vector xs; - std::vector args = subgraph_sym.ListInputs(nnvm::Symbol::kReadOnlyArgs); - xs.reserve(args.size()); - for (const auto& i : args) - xs.emplace_back(NodeEntry{i, 0, 0}); - CHECK_GT(xs.size(), 0) - << "There are no inputs in computation graph that require gradients."; - - static const std::vector zero_ops{Op::Get("zeros_like"), Op::Get("_zeros")}; - grad_graph = pass::Gradient( - fwd_graph, fwd_graph.outputs, xs, ograd_entries, - exec::AggregateGradient, nullptr, nullptr, - zero_ops, "_copy"); - potential_nodes.reserve(fwd_graph.outputs.size() + xs.size() + ograd_entries.size()); - for (auto e : ograd_entries) - potential_nodes.push_back(e.node.get()); - for (auto e : xs) - potential_nodes.push_back(e.node.get()); - for (auto e : fwd_graph.outputs) - potential_nodes.push_back(e.node.get()); - } - - const auto& idx = grad_graph.indexed_graph(); - auto input_nodes = idx.input_nodes(); - StorageTypeVector storage_type_inputs(input_nodes.size()); - for (size_t i = 0; i < input_nodes.size(); i++) { - auto node_id = input_nodes[i]; - const nnvm::IndexedGraph::Node &n = idx[node_id]; - auto it = std::find(potential_nodes.begin(), potential_nodes.end(), n.source); - CHECK(it != potential_nodes.end()); - size_t idx = it - potential_nodes.begin(); - CHECK_LT(idx, in_attrs->size()); - storage_type_inputs[i] = in_attrs->at(idx); - } - CHECK_EQ(idx.outputs().size(), out_attrs->size()); - exec::DevMaskVector dev_masks(idx.num_nodes(), dev_mask); - imperative::CheckAndInferStorageType(&grad_graph, std::move(dev_masks), - std::move(storage_type_inputs), true); - - const auto& stypes = grad_graph.GetAttr("storage_type"); - *dispatch_mode = DispatchMode::kFComputeEx; - auto &outputs = idx.outputs(); - CHECK(outputs.size() == out_attrs->size()); - for (size_t i = 0; i < out_attrs->size(); i++) - (*out_attrs)[i] = stypes[idx.entry_id(outputs[i])]; - return true; + return InferSubgraphBackwardStorage(*attrs.subgraphs[0], dev_mask, + dispatch_mode, in_attrs, out_attrs); } static OpStatePtr CreateForeachState(const NodeAttrs& attrs, diff --git a/src/operator/nn/subgraph_op_common.cc 
b/src/operator/nn/subgraph_op_common.cc new file mode 100644 index 000000000000..7efd48e0eed0 --- /dev/null +++ b/src/operator/nn/subgraph_op_common.cc @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "./subgraph_op_common.h" +#include "../operator_common.h" +#include "../../imperative/imperative_utils.h" + +namespace mxnet { +namespace op { + +bool InferSubgraphDataType(nnvm::Symbol &subgraph, std::vector *in_type, + std::vector *out_type) { + nnvm::DTypeVector dtype_inputs = *in_type; + nnvm::Graph g; + g.outputs = subgraph.outputs; + const auto& idx = g.indexed_graph(); + CHECK_EQ(idx.input_nodes().size(), in_type->size()); + CHECK_EQ(idx.outputs().size(), out_type->size()); + imperative::CheckAndInferType(&g, std::move(dtype_inputs), true); + + const auto &dtypes = g.GetAttr("dtype"); + + // Inferring the data type in the subgraph may infer the data type of the inputs. + // We need to copy the inferred input data types back. + const auto &input_nids = idx.input_nodes(); + CHECK_EQ(input_nids.size(), in_type->size()); + for (size_t i = 0; i < in_type->size(); i++) { + auto eid = idx.entry_id(input_nids[i], 0); + (*in_type)[i] = dtypes[eid]; + } + + for (size_t i = 0; i < g.outputs.size(); i++) + (*out_type)[i] = dtypes[idx.entry_id(g.outputs[i])]; + return true; +} + +bool InferSubgraphStorage(nnvm::Symbol &subgraph, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + nnvm::Graph g; + g.outputs = subgraph.outputs; + const auto& idx = g.indexed_graph(); + CHECK_EQ(idx.input_nodes().size(), in_attrs->size()); + CHECK_EQ(idx.outputs().size(), out_attrs->size()); + exec::DevMaskVector dev_masks(idx.num_nodes(), dev_mask); + StorageTypeVector storage_type_inputs = *in_attrs; + imperative::CheckAndInferStorageType(&g, std::move(dev_masks), + std::move(storage_type_inputs), true); + + const auto& stypes = g.GetAttr("storage_type"); + + // Inferring the storage in the subgraph may infer the storage of the inputs. + // We need to copy the inferred input storage back. 
+ const auto &input_nids = idx.input_nodes(); + CHECK_EQ(input_nids.size(), in_attrs->size()); + for (size_t i = 0; i < in_attrs->size(); i++) { + auto eid = idx.entry_id(input_nids[i], 0); + (*in_attrs)[i] = stypes[eid]; + } + + *dispatch_mode = DispatchMode::kFComputeEx; + auto &outputs = idx.outputs(); + CHECK(outputs.size() == out_attrs->size()); + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = stypes[idx.entry_id(outputs[i])]; + return true; +} + +bool InferSubgraphBackwardStorage(nnvm::Symbol &subgraph, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + using namespace nnvm; + // construct backward graph + nnvm::Graph grad_graph; + nnvm::Graph fwd_graph; + std::vector potential_nodes; + { + fwd_graph.outputs = subgraph.outputs; + std::vector ograd_entries; + ograd_entries.reserve(fwd_graph.outputs.size()); + for (size_t i = 0; i < fwd_graph.outputs.size(); ++i) { + ograd_entries.emplace_back(NodeEntry{Node::Create(), 0, 0}); + } + + std::vector xs; + std::vector args = subgraph.ListInputs(nnvm::Symbol::kReadOnlyArgs); + xs.reserve(args.size()); + for (const auto& i : args) + xs.emplace_back(NodeEntry{i, 0, 0}); + CHECK_GT(xs.size(), 0) + << "There are no inputs in computation graph that require gradients."; + + static const std::vector zero_ops{Op::Get("zeros_like"), Op::Get("_zeros")}; + grad_graph = pass::Gradient( + fwd_graph, fwd_graph.outputs, xs, ograd_entries, + exec::AggregateGradient, nullptr, nullptr, + zero_ops, "_copy"); + potential_nodes.reserve(fwd_graph.outputs.size() + xs.size() + ograd_entries.size()); + for (auto e : ograd_entries) + potential_nodes.push_back(e.node.get()); + for (auto e : xs) + potential_nodes.push_back(e.node.get()); + for (auto e : fwd_graph.outputs) + potential_nodes.push_back(e.node.get()); + } + + const auto& idx = grad_graph.indexed_graph(); + auto input_nodes = idx.input_nodes(); + StorageTypeVector storage_type_inputs(input_nodes.size()); + for (size_t i = 0; i < input_nodes.size(); i++) { + auto node_id = input_nodes[i]; + const nnvm::IndexedGraph::Node &n = idx[node_id]; + auto it = std::find(potential_nodes.begin(), potential_nodes.end(), n.source); + CHECK(it != potential_nodes.end()); + size_t idx = it - potential_nodes.begin(); + CHECK_LT(idx, in_attrs->size()); + storage_type_inputs[i] = in_attrs->at(idx); + } + CHECK_EQ(idx.outputs().size(), out_attrs->size()); + exec::DevMaskVector dev_masks(idx.num_nodes(), dev_mask); + imperative::CheckAndInferStorageType(&grad_graph, std::move(dev_masks), + std::move(storage_type_inputs), true); + + const auto& stypes = grad_graph.GetAttr("storage_type"); + *dispatch_mode = DispatchMode::kFComputeEx; + auto &outputs = idx.outputs(); + CHECK(outputs.size() == out_attrs->size()); + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = stypes[idx.entry_id(outputs[i])]; + return true; +} + +} // namespace op +} // namespace mxnet diff --git a/src/operator/nn/subgraph_op_common.h b/src/operator/nn/subgraph_op_common.h new file mode 100644 index 000000000000..6412bb45e8a3 --- /dev/null +++ b/src/operator/nn/subgraph_op_common.h @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef MXNET_OPERATOR_NN_SUBGRAPH_OP_COMMON_H_ +#define MXNET_OPERATOR_NN_SUBGRAPH_OP_COMMON_H_ + +#include +#include +#include + +namespace mxnet { +namespace op { + +bool InferSubgraphDataType(nnvm::Symbol &subgraph, std::vector *in_type, + std::vector *out_type); +bool InferSubgraphStorage(nnvm::Symbol &subgraph, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs); +bool InferSubgraphBackwardStorage(nnvm::Symbol &subgraph, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs); + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NN_SUBGRAPH_OP_COMMON_H_ From 2beb3f3813053597d0f6b1d9b316ea3fe02c189e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 18 May 2018 17:47:02 +0000 Subject: [PATCH 047/135] address comments. --- src/operator/nn/control_flow.cc | 26 +++++++++++++------------- src/operator/nn/subgraph_op_common.cc | 14 +++++++------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index ab3e4118cdb0..8e242e950aff 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -391,14 +391,14 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, shape_inputs[loc] = TShape(in_shape->at(loc).begin() + 1, in_shape->at(loc).end()); } CHECK_EQ(attrs.subgraphs.size(), 1U); - auto g = std::make_shared(); - g->outputs = attrs.subgraphs[0]->outputs; - const auto& idx = g->indexed_graph(); + nnvm::Graph g; + g.outputs = attrs.subgraphs[0]->outputs; + const auto& idx = g.indexed_graph(); CHECK_EQ(idx.input_nodes().size(), in_shape->size()); CHECK_EQ(idx.outputs().size(), out_shape->size()); - imperative::CheckAndInferShape(g.get(), std::move(shape_inputs), true); - const auto& shapes = g->GetAttr("shape"); + imperative::CheckAndInferShape(&g, std::move(shape_inputs), true); + const auto& shapes = g.GetAttr("shape"); // Inferring the shape in the subgraph may infer the shape of the inputs. // We need to copy the inferred input shapes back. const auto &input_nids = idx.input_nodes(); @@ -407,26 +407,26 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, auto eid = idx.entry_id(input_nids[i], 0); // If the input shape is none, we should update them. if ((*in_shape)[i].ndim() == 0 || (*in_shape)[i].Size() == 0) - (*in_shape)[i] = shapes[eid]; + SHAPE_ASSIGN_CHECK(*in_shape, i, shapes[eid]); } // For the shape of output data. for (int i = 0; i < params.num_out_data; i++) { - uint32_t eid = idx.entry_id(g->outputs[i]); + uint32_t eid = idx.entry_id(g.outputs[i]); const auto& g_out_shape = shapes[eid]; - auto &out = (*out_shape)[i]; - out = TShape(g_out_shape.ndim() + 1); + auto out = TShape(g_out_shape.ndim() + 1); out[0] = len; for (size_t i = 1; i < out.ndim(); i++) out[i] = g_out_shape[i - 1]; + SHAPE_ASSIGN_CHECK(*out_shape, i, out); } // For the remaining shapes. 
- for (size_t i = params.num_out_data; i < g->outputs.size(); i++) { - uint32_t eid = idx.entry_id(g->outputs[i]); - (*out_shape)[i] = shapes[eid]; + for (size_t i = params.num_out_data; i < g.outputs.size(); i++) { + uint32_t eid = idx.entry_id(g.outputs[i]); + SHAPE_ASSIGN_CHECK(*out_shape, i, shapes[eid]); } - size_t num_states = g->outputs.size() - params.num_out_data; + size_t num_states = g.outputs.size() - params.num_out_data; for (size_t i = 0; i < num_states; i++) { size_t loc = params.in_state_locs[i]; CHECK((*out_shape)[i + params.num_out_data] == (*in_shape)[loc]); diff --git a/src/operator/nn/subgraph_op_common.cc b/src/operator/nn/subgraph_op_common.cc index 7efd48e0eed0..8221dbb7dfa6 100644 --- a/src/operator/nn/subgraph_op_common.cc +++ b/src/operator/nn/subgraph_op_common.cc @@ -42,11 +42,11 @@ bool InferSubgraphDataType(nnvm::Symbol &subgraph, std::vector *in_type, CHECK_EQ(input_nids.size(), in_type->size()); for (size_t i = 0; i < in_type->size(); i++) { auto eid = idx.entry_id(input_nids[i], 0); - (*in_type)[i] = dtypes[eid]; + TYPE_ASSIGN_CHECK(*in_type, i, dtypes[eid]); } for (size_t i = 0; i < g.outputs.size(); i++) - (*out_type)[i] = dtypes[idx.entry_id(g.outputs[i])]; + TYPE_ASSIGN_CHECK(*out_type, i, dtypes[idx.entry_id(g.outputs[i])]); return true; } @@ -73,14 +73,14 @@ bool InferSubgraphStorage(nnvm::Symbol &subgraph, CHECK_EQ(input_nids.size(), in_attrs->size()); for (size_t i = 0; i < in_attrs->size(); i++) { auto eid = idx.entry_id(input_nids[i], 0); - (*in_attrs)[i] = stypes[eid]; + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, i, stypes[eid]); } - *dispatch_mode = DispatchMode::kFComputeEx; + DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx); auto &outputs = idx.outputs(); CHECK(outputs.size() == out_attrs->size()); for (size_t i = 0; i < out_attrs->size(); i++) - (*out_attrs)[i] = stypes[idx.entry_id(outputs[i])]; + STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, i, stypes[idx.entry_id(outputs[i])]); return true; } @@ -142,11 +142,11 @@ bool InferSubgraphBackwardStorage(nnvm::Symbol &subgraph, std::move(storage_type_inputs), true); const auto& stypes = grad_graph.GetAttr("storage_type"); - *dispatch_mode = DispatchMode::kFComputeEx; + DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx); auto &outputs = idx.outputs(); CHECK(outputs.size() == out_attrs->size()); for (size_t i = 0; i < out_attrs->size(); i++) - (*out_attrs)[i] = stypes[idx.entry_id(outputs[i])]; + STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, i, stypes[idx.entry_id(outputs[i])]); return true; } From 716bc6a40b536e98248892d5f04d5abaaa8d12a3 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 18 May 2018 22:27:54 +0000 Subject: [PATCH 048/135] fix lint. 
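
For reference, the end-to-end executor flow that the foreach tests above drive, i.e. the bind/forward/backward path exercising ForeachComputeExCPU and ForeachGradComputeExCPU. This is a sketch with an illustrative step body, names and shapes, not code from the tree:

    import mxnet as mx

    data = mx.sym.var("d")
    state = mx.sym.var("s")
    res = mx.sym.contrib.foreach(lambda x, st: (x + st[0], [x + st[0]]), data, [state])
    out = mx.sym.Group([res[0], res[1][0]])

    d = mx.nd.random.uniform(shape=(3, 2))
    s = mx.nd.random.uniform(shape=(2,))
    dg, sg = mx.nd.empty(d.shape), mx.nd.empty(s.shape)
    e = out.bind(ctx=mx.cpu(), args={'d': d, 's': s}, args_grad={'d': dg, 's': sg})
    e.forward(is_train=True)
    e.backward([mx.nd.ones((3, 2)), mx.nd.ones((2,))])  # input gradients land in dg and sg
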
--- src/operator/nn/control_flow.cc | 11 +++-------- src/operator/nn/subgraph_op_common.cc | 7 ++++--- src/operator/nn/subgraph_op_common.h | 6 +++--- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 8e242e950aff..caf77bda6029 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -90,9 +90,9 @@ class ForeachState { } }; -void ForeachState::Forward(std::vector cinputs, +void ForeachState::Forward(const std::vector &cinputs, const std::vector& req, - std::vector coutputs, bool is_recording) { + const std::vector &coutputs, bool is_recording) { using namespace nnvm; using namespace imperative; @@ -136,8 +136,6 @@ void ForeachState::Forward(std::vector cinputs, std::unordered_map > params; CachedOpPtr op = std::make_shared(subgraph_sym, kwargs, arg_names, params); - // TODO here we only changed the output arrays in the arguments. - // Will this be a problem? // TODO(zhengda) we need to avoid shape inference and memory plan whenever the op is // called. Currently, CachedOp allocates memory each time Forward is called. // I need to fix this once the PR for static memory allocation in CachedOp is @@ -145,7 +143,6 @@ void ForeachState::Forward(std::vector cinputs, op->Forward(nullptr, inputs, outputs); if (is_recording) { - // TODO does this have right inputs and outputs? all_outputs.push_back(coutputs); iter_ops.push_back(op); } @@ -186,8 +183,6 @@ void ForeachState::Backward(int iter_no, std::vector ograds, outputs.push_back(&igrads[i]); CHECK_EQ(outputs.size(), op->num_inputs()); - // TODO here we only changed the output arrays in the arguments. - // Will this be a problem? CHECK(!Imperative::AGInfo::IsNone(all_outputs[iter_no][0])); const nnvm::NodeEntry &node_entry = all_outputs[iter_no][0].GetAutogradEntry(); OpStatePtr state = Imperative::AGInfo::Get(node_entry.node).state; @@ -255,7 +250,7 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, std::vector *subg_out_prev = subg_outputs[(i + 1) % 2]; for (int j = 0; j < params.num_out_data; j++) (*subg_out_curr)[j] = outputs[j].At(i); - // When recording for backward computation, we should make sure + // When recording for backward computation, we should make sure // that output arrays are actually different in each iteration. 
if (ctx.need_grad && i < len - 1) { for (size_t j = params.num_out_data; j < subg_out_curr->size(); j++) diff --git a/src/operator/nn/subgraph_op_common.cc b/src/operator/nn/subgraph_op_common.cc index 8221dbb7dfa6..ac2218b062fe 100644 --- a/src/operator/nn/subgraph_op_common.cc +++ b/src/operator/nn/subgraph_op_common.cc @@ -24,7 +24,8 @@ namespace mxnet { namespace op { -bool InferSubgraphDataType(nnvm::Symbol &subgraph, std::vector *in_type, +bool InferSubgraphDataType(const nnvm::Symbol &subgraph, + std::vector *in_type, std::vector *out_type) { nnvm::DTypeVector dtype_inputs = *in_type; nnvm::Graph g; @@ -50,7 +51,7 @@ bool InferSubgraphDataType(nnvm::Symbol &subgraph, std::vector *in_type, return true; } -bool InferSubgraphStorage(nnvm::Symbol &subgraph, +bool InferSubgraphStorage(const nnvm::Symbol &subgraph, const int dev_mask, DispatchMode* dispatch_mode, std::vector *in_attrs, @@ -84,7 +85,7 @@ bool InferSubgraphStorage(nnvm::Symbol &subgraph, return true; } -bool InferSubgraphBackwardStorage(nnvm::Symbol &subgraph, +bool InferSubgraphBackwardStorage(const nnvm::Symbol &subgraph, const int dev_mask, DispatchMode* dispatch_mode, std::vector *in_attrs, diff --git a/src/operator/nn/subgraph_op_common.h b/src/operator/nn/subgraph_op_common.h index 6412bb45e8a3..1b6587953c78 100644 --- a/src/operator/nn/subgraph_op_common.h +++ b/src/operator/nn/subgraph_op_common.h @@ -27,14 +27,14 @@ namespace mxnet { namespace op { -bool InferSubgraphDataType(nnvm::Symbol &subgraph, std::vector *in_type, +bool InferSubgraphDataType(const nnvm::Symbol &subgraph, std::vector *in_type, std::vector *out_type); -bool InferSubgraphStorage(nnvm::Symbol &subgraph, +bool InferSubgraphStorage(const nnvm::Symbol &subgraph, const int dev_mask, DispatchMode* dispatch_mode, std::vector *in_attrs, std::vector *out_attrs); -bool InferSubgraphBackwardStorage(nnvm::Symbol &subgraph, +bool InferSubgraphBackwardStorage(const nnvm::Symbol &subgraph, const int dev_mask, DispatchMode* dispatch_mode, std::vector *in_attrs, From dd5f86202afe01bb48da9efda41f335a8f84d17f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 18 May 2018 22:33:23 +0000 Subject: [PATCH 049/135] Fix lint. --- src/operator/nn/subgraph_op_common.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/operator/nn/subgraph_op_common.h b/src/operator/nn/subgraph_op_common.h index 1b6587953c78..2025a2baefa2 100644 --- a/src/operator/nn/subgraph_op_common.h +++ b/src/operator/nn/subgraph_op_common.h @@ -23,17 +23,32 @@ #include #include #include +#include namespace mxnet { namespace op { +/* + * Infer the data types of inputs and outputs of an operator that contains a + * subgraph. + */ bool InferSubgraphDataType(const nnvm::Symbol &subgraph, std::vector *in_type, std::vector *out_type); + +/* + * Infer the storage types of inputs and outputs of an operator that contains a + * subgraph. + */ bool InferSubgraphStorage(const nnvm::Symbol &subgraph, const int dev_mask, DispatchMode* dispatch_mode, std::vector *in_attrs, std::vector *out_attrs); + +/* + * Infer the storage types of inputs and outputs of the backward computation of + * an operator that contains a subgraph. + */ bool InferSubgraphBackwardStorage(const nnvm::Symbol &subgraph, const int dev_mask, DispatchMode* dispatch_mode, From b60157ab27a0c4976baf180445e92eedb3b05e7c Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 19 May 2018 00:38:02 +0000 Subject: [PATCH 050/135] add doc. 
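For reference, a rough end-to-end sketch of the contract documented here
(variable names and shapes are only illustrative, not part of the patch):

    import mxnet as mx

    def step(data, states):
        # data is one slice of the input along dimension 0;
        # states is the list of loop states from the previous iteration.
        return data + states[0], [states[0] * 2]

    data = mx.sym.var('data')      # e.g. shape (3, 4)
    state = mx.sym.var('state')    # e.g. shape (4,)
    outs, last_states = mx.sym.contrib.foreach(step, data, [state])

    exe = mx.sym.Group([outs] + last_states).bind(
        ctx=mx.cpu(), args={'data': mx.nd.ones((3, 4)),
                            'state': mx.nd.ones((4,))})
    exe.forward()
    # exe.outputs[0]: per-iteration outputs stacked along dim 0 -> (3, 4)
    # exe.outputs[1]: loop state returned by the last iteration -> (4,)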
--- python/mxnet/symbol/contrib.py | 65 ++++++++++++++++++++++++++++++--- src/operator/nn/control_flow.cc | 2 +- 2 files changed, 61 insertions(+), 6 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index c236dffa63cd..bc5cd08c4583 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -99,11 +99,10 @@ def rand_zipfian(true_classes, num_sampled, range_max): expected_count_sampled = expected_prob_sampled * num_sampled return sampled_classes, expected_count_true, expected_count_sampled -def _get_graph_inputs(subg, name, prefix): +def _get_graph_inputs(subg): num_handles = ctypes.c_int(1000) handles = c_array(SymbolHandle, [SymbolHandle(0) for i in range(1000)]) - check_call(_LIB.MXSymbolGetInputSymbols(subg.handle, handles, - ctypes.byref(num_handles))) + check_call(_LIB.MXSymbolGetInputSymbols(subg.handle, handles, ctypes.byref(num_handles))) syms = [] for i in range(num_handles.value): @@ -111,7 +110,63 @@ def _get_graph_inputs(subg, name, prefix): syms.append(s) return syms -def foreach(func, data, init_states, back_prop=False, name="foreach"): +def foreach(func, data, init_states, name="foreach"): + """Run a for loop with user-defined computation over NDArrays on dimension 0. + + This operator simulates a for loop and func has the computation for an iteration + of the for loop. It runs the computation in func on each slice from the input + NDArrays. + + func takes two arguments as input and outputs a tuple of two elements, + as illustrated below: + + out, states = func(data1, states) + + data1 can be either a symbol or a list of symbols. If data is a symbol, + data1 is a symbol. Otherwise, data1 is a list of symbols and has the same + size as data. states is a list of symbols and have the same size as init_states. + Similarly, out can be either a symbol or a list of symbols, which are concatenated + as the first output of foreach; states from the last execution of func + are the second output of foreach. + + The computation done by this operator is equivalent to the pseudo code below + when the input data is NDArray: + + states = init_states + outs = [] + for i in data.shape[0]: + s = data[i] + out, states = func(s, states) + outs.append(out) + outs = stack(*outs) + + + Parameters + ---------- + func : a Python function. + Define computation in an iteration. + data: a symbol or a list of symbols. + The input data. + init_states: a list of symbols. + The initial values of the loop states. + name: string. + The name of the operator. + + Returns + ------- + outputs: a Symbol or a list of Symbols. + The output data concatenated from the output of all iterations. + states: a list of Symbols. + The loop states in the last iteration. + + Examples + -------- + >>> step = lambda data, states: (data + states[0], [states[0] * 2]) + >>> data = mx.sym.var('data') + >>> states = [mx.sym.var('state')] + >>> outs, states = mx.sym.contrib.foreach(step, data, states) + """ + assert isinstance(init_states, list), "init_states should be a list" states = [] @@ -148,7 +203,7 @@ def foreach(func, data, init_states, back_prop=False, name="foreach"): # all symbols will refer to different NDArrays. 
flat_out.append(symbol.op.identity(s)) g = symbol.Group(flat_out) - input_syms = _get_graph_inputs(g, name, "ro_var") + input_syms = _get_graph_inputs(g) if isinstance(data, list): num_inputs = len(data) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index caf77bda6029..3e8e733f572f 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -478,7 +478,7 @@ ForeachGradient(const nnvm::NodePtr& n, const std::vector& ogra } NNVM_REGISTER_OP(_foreach) -.describe(R"code(foreach)code" ADD_FILELINE) +.MXNET_DESCRIBE("Run a for loop over an NDArray with user-defined computation") .set_attr_parser(ParamParser) .set_attr("FInferStorageType", ForeachStorageType) .set_num_inputs([](const NodeAttrs& attrs) { From 57b2ba5e94cd9734c6498b4313585489d50e8528 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 19 May 2018 00:39:39 +0000 Subject: [PATCH 051/135] undo modification in imperative.h --- include/mxnet/imperative.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/mxnet/imperative.h b/include/mxnet/imperative.h index d50c1c371428..7ea60df33028 100644 --- a/include/mxnet/imperative.h +++ b/include/mxnet/imperative.h @@ -105,18 +105,18 @@ class Imperative { std::vector* p_save_inputs = nullptr, std::vector* p_save_outputs = nullptr); /*! \brief */ - static OpStatePtr Invoke(const Context& default_ctx, - const nnvm::NodeAttrs& attrs, - const std::vector& inputs, - const std::vector& outputs); + OpStatePtr Invoke(const Context& default_ctx, + const nnvm::NodeAttrs& attrs, + const std::vector& inputs, + const std::vector& outputs); /*! \brief */ - static OpStatePtr InvokeOp(const Context& ctx, - const nnvm::NodeAttrs& attrs, - const std::vector& inputs, - const std::vector& outputs, - const std::vector& req, - const DispatchMode dispatch_mode, - OpStatePtr state = OpStatePtr()); + OpStatePtr InvokeOp(const Context& ctx, + const nnvm::NodeAttrs& attrs, + const std::vector& inputs, + const std::vector& outputs, + const std::vector& req, + const DispatchMode dispatch_mode, + OpStatePtr state = OpStatePtr()); /*! \brief mark variables for computing gradients. */ void MarkVariables(const std::vector& variables, const std::vector& grad_reqs, From 7c49057ff76c42c27b3d0b1913b723e218879f0a Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 19 May 2018 00:48:57 +0000 Subject: [PATCH 052/135] add doc and remove example code. --- python/mxnet/gluon/contrib/rnn/rnn_cell.py | 146 +-------------------- python/mxnet/ndarray/contrib.py | 62 ++++++++- 2 files changed, 60 insertions(+), 148 deletions(-) diff --git a/python/mxnet/gluon/contrib/rnn/rnn_cell.py b/python/mxnet/gluon/contrib/rnn/rnn_cell.py index dcb396a57613..1b9afee14bf2 100644 --- a/python/mxnet/gluon/contrib/rnn/rnn_cell.py +++ b/python/mxnet/gluon/contrib/rnn/rnn_cell.py @@ -17,15 +17,10 @@ # coding: utf-8 """Definition of various recurrent neural network cells.""" -__all__ = ['VariationalDropoutCell', 'LSTMPCell', 'SymHybridRNNCell', 'RNNCell'] +__all__ = ['VariationalDropoutCell', 'LSTMPCell'] -import inspect - -from .... import symbol, ndarray -from ....base import _as_list from ...rnn import BidirectionalCell, SequentialRNNCell, ModifierCell, HybridRecurrentCell from ...rnn.rnn_cell import _format_sequence, _get_begin_state, _mask_sequence_variable_length -from ...rnn.rnn_cell import RNNCell as GluonRNNCell from ... 
import tensor_types class VariationalDropoutCell(ModifierCell): @@ -320,142 +315,3 @@ def hybrid_forward(self, F, inputs, states, i2h_weight, return next_r, [next_r, next_c] # pylint: enable= arguments-differ - -class SymHybridRNNCell(HybridRecurrentCell): - def __init__(self, prefix=None, params=None): - super(SymHybridRNNCell, self).__init__(prefix=prefix, params=params) - - def unroll(self, inputs, begin_state=None, layout='TNC', - merge_outputs=None, valid_length=None): - # if this is a list, we can have unroll in the parent class to handle it. - if (isinstance(inputs, list)): - return super(SymHybridRNNCell, self).unroll(self, len(inputs), inputs, begin_state, - layout, merge_outputs, valid_length) - - self.reset() - batch_axis = layout.find('N') - axis = layout.find('T') - batch_size = 0 - if isinstance(inputs, symbol.Symbol): - F = symbol - else: - batch_size = inputs.shape[batch_axis] - F = ndarray - begin_state = _get_begin_state(self, F, begin_state, inputs, batch_size) - - states = begin_state - outputs = [] - all_states = [] - def iter_func(input, states): - return self(input, states) - outputs, last_states = F.contrib.foreach(iter_func, inputs, begin_state) - #if valid_length is not None: - # states = [F.SequenceLast(ele_list, - # sequence_length=valid_length, - # use_sequence_length=True, - # axis=0) - # for ele_list in all_states] - # outputs = F.SequenceMask(outputs, sequence_length=valid_length, use_sequence_length=True, - # axis=axis) - #outputs, _, _, _ = _format_sequence(length, outputs, layout, merge_outputs) - - return outputs, last_states - -class RNNCell(SymHybridRNNCell): - r"""Elman RNN recurrent neural network cell. - - Each call computes the following function: - - .. math:: - - h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh}) - - where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is the hidden - state of the previous layer at time `t` or :math:`input_t` for the first layer. - If nonlinearity='relu', then `ReLU` is used instead of `tanh`. - - Parameters - ---------- - hidden_size : int - Number of units in output symbol - activation : str or Symbol, default 'tanh' - Type of activation function. - i2h_weight_initializer : str or Initializer - Initializer for the input weights matrix, used for the linear - transformation of the inputs. - h2h_weight_initializer : str or Initializer - Initializer for the recurrent weights matrix, used for the linear - transformation of the recurrent state. - i2h_bias_initializer : str or Initializer - Initializer for the bias vector. - h2h_bias_initializer : str or Initializer - Initializer for the bias vector. - prefix : str, default 'rnn_' - Prefix for name of `Block`s - (and name of weight if params is `None`). - params : Parameter or None - Container for weight sharing between cells. - Created if `None`. - - - Inputs: - - **data**: input tensor with shape `(batch_size, input_size)`. - - **states**: a list of one initial recurrent state tensor with shape - `(batch_size, num_hidden)`. - - Outputs: - - **out**: output tensor with shape `(batch_size, num_hidden)`. - - **next_states**: a list of one output recurrent state tensor with the - same shape as `states`. 
- """ - def __init__(self, hidden_size, activation='tanh', - i2h_weight_initializer=None, h2h_weight_initializer=None, - i2h_bias_initializer='zeros', h2h_bias_initializer='zeros', - input_size=0, prefix=None, params=None): - super(RNNCell, self).__init__(prefix=prefix, params=params) - self._hidden_size = hidden_size - self._activation = activation - self._input_size = input_size - self.i2h_weight = self.params.get('i2h_weight', shape=(hidden_size, input_size), - init=i2h_weight_initializer, - allow_deferred_init=True) - self.h2h_weight = self.params.get('h2h_weight', shape=(hidden_size, hidden_size), - init=h2h_weight_initializer, - allow_deferred_init=True) - self.i2h_bias = self.params.get('i2h_bias', shape=(hidden_size,), - init=i2h_bias_initializer, - allow_deferred_init=True) - self.h2h_bias = self.params.get('h2h_bias', shape=(hidden_size,), - init=h2h_bias_initializer, - allow_deferred_init=True) - - def state_info(self, batch_size=0): - return [{'shape': (batch_size, self._hidden_size), '__layout__': 'NC'}] - - def _alias(self): - return 'rnn' - - def __repr__(self): - s = '{name}({mapping}' - if hasattr(self, '_activation'): - s += ', {_activation}' - s += ')' - shape = self.i2h_weight.shape - mapping = '{0} -> {1}'.format(shape[1] if shape[1] else None, shape[0]) - return s.format(name=self.__class__.__name__, - mapping=mapping, - **self.__dict__) - - def hybrid_forward(self, F, inputs, states, i2h_weight, - h2h_weight, i2h_bias, h2h_bias): - prefix = 't%d_'%self._counter - i2h = F.FullyConnected(data=inputs, weight=i2h_weight, bias=i2h_bias, - num_hidden=self._hidden_size, - name=prefix+'i2h') - h2h = F.FullyConnected(data=states[0], weight=h2h_weight, bias=h2h_bias, - num_hidden=self._hidden_size, - name=prefix+'h2h') - output = self._get_activation(F, i2h + h2h, self._activation, - name=prefix+'out') - - return output, [output] diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index 0697f5ac2bbd..bd0efc9a873e 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -98,12 +98,68 @@ def rand_zipfian(true_classes, num_sampled, range_max, ctx=None): return sampled_classes, expected_count_true, expected_count_sampled # pylint: enable=line-too-long -def foreach(func, input, init_states, back_prop=False, name="foreach"): +def foreach(func, data, init_states, name="foreach"): + """Run a for loop with user-defined computation over NDArrays on dimension 0. + + This operator simulates a for loop and func has the computation for an iteration + of the for loop. It runs the computation in func on each slice from the input + NDArrays. + + func takes two arguments as input and outputs a tuple of two elements, + as illustrated below: + + out, states = func(data1, states) + + data1 can be either a symbol or a list of symbols. If data is a symbol, + data1 is a symbol. Otherwise, data1 is a list of symbols and has the same + size as data. states is a list of symbols and have the same size as init_states. + Similarly, out can be either a symbol or a list of symbols, which are concatenated + as the first output of foreach; states from the last execution of func + are the second output of foreach. + + The computation done by this operator is equivalent to the pseudo code below + when the input data is NDArray: + + states = init_states + outs = [] + for i in data.shape[0]: + s = data[i] + out, states = func(s, states) + outs.append(out) + outs = stack(*outs) + + + Parameters + ---------- + func : a Python function. 
+ Define computation in an iteration. + data: a symbol or a list of symbols. + The input data. + init_states: a list of symbols. + The initial values of the loop states. + name: string. + The name of the operator. + + Returns + ------- + outputs: a Symbol or a list of Symbols. + The output data concatenated from the output of all iterations. + states: a list of Symbols. + The loop states in the last iteration. + + Examples + -------- + >>> step = lambda data, states: (data + states[0], [states[0] * 2]) + >>> data = mx.nd.random.uniform(shape=(2, 10)) + >>> states = [mx.nd.random.uniform(shape=(10))] + >>> outs, states = mx.nd.contrib.foreach(step, data, states) + """ + assert isinstance(init_states, list), "init_states should be a list" states = init_states outputs = [] - for i in range(input.shape[0]): - ele = input[i] + for i in range(data.shape[0]): + ele = data[i] outs, states = func(ele, states) outs = _as_list(outs) if (i == 0): From 104590879d2d6deb77030d46629f6df4768c8d4f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 19 May 2018 00:59:44 +0000 Subject: [PATCH 053/135] fix lint. --- python/mxnet/symbol/contrib.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index bc5cd08c4583..8840f06a29e5 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -29,8 +29,8 @@ import ctypes from . import symbol -from ..base import _LIB, c_str, c_array, check_call -from ..base import SymbolHandle, NDArrayHandle, _as_list +from ..base import _LIB, c_array, check_call +from ..base import SymbolHandle, _as_list from ..attribute import AttrScope __all__ = ["rand_zipfian"] @@ -205,11 +205,6 @@ def foreach(func, data, init_states, name="foreach"): g = symbol.Group(flat_out) input_syms = _get_graph_inputs(g) - if isinstance(data, list): - num_inputs = len(data) - else: - num_inputs = 1 - # Here we need to find out how the input symbols are ordered as well as # where the loop states are located in the list of inputs. From e8ec3aaf20e45ff3d44009d9c93ab7bda5674458 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 19 May 2018 01:01:55 +0000 Subject: [PATCH 054/135] fix lint. --- python/mxnet/ndarray/contrib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index bd0efc9a873e..45a7d14960fa 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -98,7 +98,7 @@ def rand_zipfian(true_classes, num_sampled, range_max, ctx=None): return sampled_classes, expected_count_true, expected_count_sampled # pylint: enable=line-too-long -def foreach(func, data, init_states, name="foreach"): +def foreach(func, data, init_states): """Run a for loop with user-defined computation over NDArrays on dimension 0. This operator simulates a for loop and func has the computation for an iteration From e4f58082e2e401160811e44e2237dcf91eef4104 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 19 May 2018 01:17:25 +0000 Subject: [PATCH 055/135] Fix lint. 
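Pylint fixes in the imperative foreach fallback (iterate directly and use
enumerate instead of range(len(...))) plus import ordering in
symbol/contrib.py. For context, the fallback threads the loop states
through the iterations and stacks each output along dimension 0, roughly
like the illustrative helper below (hypothetical name, not the patched
implementation):

    import mxnet as mx

    def foreach_fallback(step, data, init_states):
        # step(slice, states) -> (out or list of outs, new states)
        states = init_states
        outs_per_iter = []
        for i in range(data.shape[0]):
            out, states = step(data[i], states)
            outs_per_iter.append(out if isinstance(out, list) else [out])
        # transpose to one list per output, then stack each along dim 0
        return [mx.nd.stack(*outs) for outs in zip(*outs_per_iter)], states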
--- python/mxnet/ndarray/contrib.py | 14 +++++++------- python/mxnet/symbol/contrib.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index 45a7d14960fa..9d7925c0ed9c 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -162,13 +162,13 @@ def foreach(func, data, init_states): ele = data[i] outs, states = func(ele, states) outs = _as_list(outs) - if (i == 0): + if i == 0: # outputs is a list of lists - for j in range(len(outs)): - outputs.append([outs[j]]) + for out in outs: + outputs.append([out]) else: - for j in range(len(outs)): - outputs[j].append(outs[j]) - for i in range(len(outputs)): - outputs[i] = stack(*outputs[i]) + for j, out in enumerate(outs): + outputs[j].append(out) + for out in outputs: + out = stack(*out) return (outputs, states) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 8840f06a29e5..b1e448b4352a 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -19,6 +19,8 @@ # pylint: disable=wildcard-import, unused-wildcard-import """Contrib Symbol API of MXNet.""" import math +import ctypes + from .random import uniform from .symbol import Symbol try: @@ -26,8 +28,6 @@ except ImportError: pass -import ctypes - from . import symbol from ..base import _LIB, c_array, check_call from ..base import SymbolHandle, _as_list From c078cbfdd18e7bc763fe753509b73f7dccf3298e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 21 May 2018 20:20:07 +0000 Subject: [PATCH 056/135] make nd.foreach and sym.foreach consistent. --- python/mxnet/ndarray/contrib.py | 53 +++++++++++++++------- python/mxnet/symbol/contrib.py | 62 +++++++++++++++++--------- tests/python/unittest/test_operator.py | 48 ++++++++------------ 3 files changed, 99 insertions(+), 64 deletions(-) diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index 9d7925c0ed9c..efc448639fc3 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -22,7 +22,7 @@ from ..context import current_context from ..random import uniform from ..base import _as_list -from .op import stack +from . import ndarray try: from .gen_contrib import * except ImportError: @@ -110,10 +110,10 @@ def foreach(func, data, init_states): out, states = func(data1, states) - data1 can be either a symbol or a list of symbols. If data is a symbol, - data1 is a symbol. Otherwise, data1 is a list of symbols and has the same - size as data. states is a list of symbols and have the same size as init_states. - Similarly, out can be either a symbol or a list of symbols, which are concatenated + data1 can be either an NDArray or a list of NDArrays. If data is an NDArray, + data1 is an NDArray. Otherwise, data1 is a list of NDArrays and has the same + size as data. states is a list of NDArrays and have the same size as init_states. + Similarly, out can be either an NDArray or a list of NDArrays, which are concatenated as the first output of foreach; states from the last execution of func are the second output of foreach. @@ -133,18 +133,18 @@ def foreach(func, data, init_states): ---------- func : a Python function. Define computation in an iteration. - data: a symbol or a list of symbols. + data: an NDArray or a list of NDArrays. The input data. - init_states: a list of symbols. + init_states: an NDArray or a list of NDArrays. The initial values of the loop states. name: string. The name of the operator. 
Returns ------- - outputs: a Symbol or a list of Symbols. + outputs: an NDArray or a list of NDArrays. The output data concatenated from the output of all iterations. - states: a list of Symbols. + states: a list of NDArrays. The loop states in the last iteration. Examples @@ -155,12 +155,32 @@ def foreach(func, data, init_states): >>> outs, states = mx.nd.contrib.foreach(step, data, states) """ - assert isinstance(init_states, list), "init_states should be a list" + def check_input(inputs, in_type, msg): + is_NDArray_or_list = True + if isinstance(inputs, list): + for i in inputs: + if not isinstance(i, in_type): + is_NDArray_or_list = False + break + else: + is_NDArray_or_list = isinstance(inputs, in_type) + assert is_NDArray_or_list, msg + + check_input(data, ndarray.NDArray, "data should be an NDArray or a list of NDArrays") + check_input(init_states, ndarray.NDArray, + "init_states should be an NDArray or a list of NDArrays") + + not_data_list = isinstance(data, ndarray.NDArray) + not_state_list = isinstance(init_states, ndarray.NDArray) + num_iters = data.shape[0] if not_data_list else data[0].shape[0] states = init_states outputs = [] - for i in range(data.shape[0]): - ele = data[i] - outs, states = func(ele, states) + for i in range(num_iters): + if not_data_list: + eles = data[i] + else: + eles = [d[i] for d in data] + outs, states = func(eles, states) outs = _as_list(outs) if i == 0: # outputs is a list of lists @@ -169,6 +189,9 @@ def foreach(func, data, init_states): else: for j, out in enumerate(outs): outputs[j].append(out) - for out in outputs: - out = stack(*out) + for j, out in enumerate(outputs): + outputs[j] = ndarray.op.stack(*out) + + if not_data_list: + outputs = outputs[0] return (outputs, states) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index b1e448b4352a..5f13e365a0cc 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -147,7 +147,7 @@ def foreach(func, data, init_states, name="foreach"): Define computation in an iteration. data: a symbol or a list of symbols. The input data. - init_states: a list of symbols. + init_states: a symbol or a list of symbols. The initial values of the loop states. name: string. The name of the operator. @@ -167,8 +167,21 @@ def foreach(func, data, init_states, name="foreach"): >>> outs, states = mx.sym.contrib.foreach(step, data, states) """ - assert isinstance(init_states, list), "init_states should be a list" - states = [] + def check_data(inputs, in_type, msg): + is_NDArray_or_list = True + if isinstance(inputs, list): + for i in inputs: + if not isinstance(i, in_type): + is_NDArray_or_list = False + break + else: + is_NDArray_or_list = isinstance(inputs, in_type) + assert is_NDArray_or_list, msg + + check_data(data, symbol.Symbol, "data should be an NDArray or a list of NDArrays") + check_data(init_states, symbol.Symbol, + "init_states should be an NDArray or a list of NDArrays") + not_state_list = isinstance(init_states, symbol.Symbol) # TODO(zhengda) If the input python function references to the symbols outside # the python function, we need to prune the computation graph constructed from @@ -179,29 +192,33 @@ def foreach(func, data, init_states, name="foreach"): in_eles = [symbol.var(sym.name) for sym in data] else: in_eles = symbol.var(data.name) - for s in init_states: - states.append(symbol.var(s.name)) - - sym_out = func(in_eles, states) - # The function should return a tuple. The first element goes to - # the output of the function. 
The second element is a list. - assert isinstance(sym_out, tuple), "func should return a tuple (out, states)" - assert isinstance(sym_out[1], list), \ - "the second element in the returned tuple should be a list" - assert len(sym_out[1]) == len(init_states), \ + if isinstance(init_states, list): + states = [symbol.var(s.name) for s in init_states] + else: + states = symbol.var(init_states.name) + sym_out, sym_states = func(in_eles, states) + + check_data(sym_out, symbol.Symbol, "the output should be an NDArray or a list of NDArrays") + check_data(sym_states, symbol.Symbol, + "the output states should be an NDArray or a list of NDArrays") + if isinstance(sym_states, list): + assert isinstance(init_states, list) and len(sym_states) == len(init_states), \ "the number of output states (%d) should be the same as input states (%d)" \ - % (len(sym_out[1]), len(init_states)) + % (len(sym_states), len(init_states)) - if isinstance(sym_out[0], list): - flat_out = sym_out[0] - else: - flat_out = [sym_out[0]] - num_out_data = len(flat_out) - for s in sym_out[1]: + if isinstance(sym_out, list): + flat_out = sym_out + else: + flat_out = [sym_out] + num_out_data = len(flat_out) + if isinstance(sym_states, list): + for s in sym_states: # There is a problem if the outputs are the same as the inputs # or the first output. By calling identity, we can make sure that # all symbols will refer to different NDArrays. flat_out.append(symbol.op.identity(s)) + else: + flat_out.append(symbol.op.identity(sym_states)) g = symbol.Group(flat_out) input_syms = _get_graph_inputs(g) @@ -248,4 +265,9 @@ def foreach(func, data, init_states, name="foreach"): for i in range(num_states): states.append(ret[num_outputs - num_states + i]) + if not_state_list: + # If there is only one input state, there should be only one output state. 
+ assert len(states) == 1 + states = states[0] + return (outs, states) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index e69f238fbbfd..aac88e023991 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5959,13 +5959,12 @@ def step3(in1, states, free): def verify_foreach(step, in_syms, state_syms, free_syms, in_arrs, init_states, frees, out_grads, is_train=True): step_sym = lambda in_syms, state_syms : step(in_syms, state_syms, free_syms) - step_imp = lambda in_arrs, state_arrs : step(in_arrs, state_arrs, frees) - out = mx.sym.contrib.foreach(step_sym, in_syms, state_syms) - out1 = _as_list(out[0]) - for i in range(len(out1)): - out1[i] = out1[i] * 2 - out1.extend(out[1]) - out = mx.sym.Group(out1) + res, states = mx.sym.contrib.foreach(step_sym, in_syms, state_syms) + out = _as_list(res) + for i in range(len(out)): + out[i] = out[i] * 2 + out.extend(states) + out = mx.sym.Group(out) arr_grads = [] arg_dict = {} arg_grad_dict = {} @@ -5994,8 +5993,7 @@ def verify_foreach(step, in_syms, state_syms, free_syms, name = name[1:] gin_order.append(int(name)) - e = out.bind(ctx=mx.cpu(), args=arg_dict, args_grad=arg_grad_dict, - ) + e = out.bind(ctx=mx.cpu(), args=arg_dict, args_grad=arg_grad_dict) e.forward(is_train=is_train) if (is_train): # backward @@ -6013,29 +6011,21 @@ def verify_foreach(step, in_syms, state_syms, free_syms, arr.attach_grad() for arr in frees: arr.attach_grad() + step_imp = lambda in_arrs, state_arrs : step(in_arrs, state_arrs, frees) with mx.autograd.record(): states = [mx.nd.expand_dims(s, 0) for s in init_states] - if isinstance(in_arrs, list): - num_iters = in_arrs[0].shape[0] + res, states = mx.nd.contrib.foreach(step_imp, in_arrs, init_states) + + res2 = _as_list(res) + for i in range(len(res2)): + res2[i] = res2[i] * 2 + if isinstance(states, list): + states = [mx.nd.expand_dims(s, 0) for s in states] + res2.extend(states) else: - num_iters = in_arrs.shape[0] - - for i in range(num_iters): - if isinstance(in_arrs, list): - data = [mx.nd.expand_dims(arr[i], 0) for arr in in_arrs] - else: - data = mx.nd.expand_dims(in_arrs[i], 0) - tmp_res = step_imp(data, states) - tmp_res1 = _as_list(tmp_res[0]) - for i in range(len(tmp_res1)): - res[i].append(tmp_res1[i]) - states = tmp_res[1] - res2 = [] - for l in res: - res2.append(mx.nd.concat(*l, dim=0) * 2) - tmp_res2 = res2[:] - tmp_res2.extend(tmp_res[1]) - res = mx.nd.concat(*tmp_res2, dim=0) + states = mx.nd.expand_dims(states, 0) + res2.append(states) + res = mx.nd.concat(*res2, dim=0) tmp_grads = out_grads[0][:] tmp_grads1 = [mx.nd.expand_dims(grad, 0) for grad in out_grads[1]] From b965e6e267f9f287876376adc783c1160c5edcc0 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 21 May 2018 20:28:54 +0000 Subject: [PATCH 057/135] fix compile error. 
--- src/operator/nn/control_flow.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 3e8e733f572f..76f33f9ff8f8 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -90,9 +90,9 @@ class ForeachState { } }; -void ForeachState::Forward(const std::vector &cinputs, +void ForeachState::Forward(std::vector cinputs, const std::vector& req, - const std::vector &coutputs, bool is_recording) { + std::vector coutputs, bool is_recording) { using namespace nnvm; using namespace imperative; From 57fcb84e53c6e2f0f1f044bb660ff369b0a56432 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 21 May 2018 22:40:23 +0000 Subject: [PATCH 058/135] address comments. --- include/mxnet/c_api.h | 4 ++-- include/mxnet/ndarray.h | 2 +- python/mxnet/ndarray/contrib.py | 27 +++++++++++---------------- python/mxnet/symbol/contrib.py | 18 +++++++++--------- src/c_api/c_api_symbolic.cc | 10 +++++----- src/imperative/imperative_utils.h | 7 +++---- src/operator/nn/control_flow.cc | 14 +++++++++----- 7 files changed, 40 insertions(+), 42 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 791a7e3aca29..afdd666765a5 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1058,8 +1058,8 @@ MXNET_DLL int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, * \param outs The input symbols of the graph. * \param out_size the number of input symbols returned. */ -MXNET_DLL int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **outs, - int *out_size); +MXNET_DLL int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **inputs, + int *input_size); /*! * \brief Get the detailed information about atomic symbol. diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index e88068537d0c..9e0f0b289d28 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -703,7 +703,7 @@ class NDArray { NDArray MKLDNNDataReshape(const TShape &shape) const; #endif - const nnvm::NodeEntry &GetAutogradEntry() const { + const nnvm::NodeEntry &entry() const { return entry_; } diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index efc448639fc3..b5839cd457be 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -98,23 +98,23 @@ def rand_zipfian(true_classes, num_sampled, range_max, ctx=None): return sampled_classes, expected_count_true, expected_count_sampled # pylint: enable=line-too-long -def foreach(func, data, init_states): +def foreach(body, data, init_states): """Run a for loop with user-defined computation over NDArrays on dimension 0. - This operator simulates a for loop and func has the computation for an iteration - of the for loop. It runs the computation in func on each slice from the input + This operator simulates a for loop and body has the computation for an iteration + of the for loop. It runs the computation in body on each slice from the input NDArrays. - func takes two arguments as input and outputs a tuple of two elements, + body takes two arguments as input and outputs a tuple of two elements, as illustrated below: - out, states = func(data1, states) + out, states = body(data1, states) data1 can be either an NDArray or a list of NDArrays. If data is an NDArray, data1 is an NDArray. Otherwise, data1 is a list of NDArrays and has the same size as data. states is a list of NDArrays and have the same size as init_states. 
Similarly, out can be either an NDArray or a list of NDArrays, which are concatenated - as the first output of foreach; states from the last execution of func + as the first output of foreach; states from the last execution of body are the second output of foreach. The computation done by this operator is equivalent to the pseudo code below @@ -124,14 +124,14 @@ def foreach(func, data, init_states): outs = [] for i in data.shape[0]: s = data[i] - out, states = func(s, states) + out, states = body(s, states) outs.append(out) outs = stack(*outs) Parameters ---------- - func : a Python function. + body : a Python function. Define computation in an iteration. data: an NDArray or a list of NDArrays. The input data. @@ -180,15 +180,10 @@ def check_input(inputs, in_type, msg): eles = data[i] else: eles = [d[i] for d in data] - outs, states = func(eles, states) + outs, states = body(eles, states) outs = _as_list(outs) - if i == 0: - # outputs is a list of lists - for out in outs: - outputs.append([out]) - else: - for j, out in enumerate(outs): - outputs[j].append(out) + outputs.append(outs) + outputs = zip(*outputs) for j, out in enumerate(outputs): outputs[j] = ndarray.op.stack(*out) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 5f13e365a0cc..383582393ac7 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -110,23 +110,23 @@ def _get_graph_inputs(subg): syms.append(s) return syms -def foreach(func, data, init_states, name="foreach"): +def foreach(body, data, init_states, name="foreach"): """Run a for loop with user-defined computation over NDArrays on dimension 0. - This operator simulates a for loop and func has the computation for an iteration - of the for loop. It runs the computation in func on each slice from the input + This operator simulates a for loop and body has the computation for an iteration + of the for loop. It runs the computation in body on each slice from the input NDArrays. - func takes two arguments as input and outputs a tuple of two elements, + body takes two arguments as input and outputs a tuple of two elements, as illustrated below: - out, states = func(data1, states) + out, states = body(data1, states) data1 can be either a symbol or a list of symbols. If data is a symbol, data1 is a symbol. Otherwise, data1 is a list of symbols and has the same size as data. states is a list of symbols and have the same size as init_states. Similarly, out can be either a symbol or a list of symbols, which are concatenated - as the first output of foreach; states from the last execution of func + as the first output of foreach; states from the last execution of body are the second output of foreach. The computation done by this operator is equivalent to the pseudo code below @@ -136,14 +136,14 @@ def foreach(func, data, init_states, name="foreach"): outs = [] for i in data.shape[0]: s = data[i] - out, states = func(s, states) + out, states = body(s, states) outs.append(out) outs = stack(*outs) Parameters ---------- - func : a Python function. + body : a Python function. Define computation in an iteration. data: a symbol or a list of symbols. The input data. 
@@ -196,7 +196,7 @@ def check_data(inputs, in_type, msg): states = [symbol.var(s.name) for s in init_states] else: states = symbol.var(init_states.name) - sym_out, sym_states = func(in_eles, states) + sym_out, sym_states = body(in_eles, states) check_data(sym_out, symbol.Symbol, "the output should be an NDArray or a list of NDArrays") check_data(sym_states, symbol.Symbol, diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 0829a25c3e7c..175ce563b861 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -345,14 +345,14 @@ int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, API_END(); } -int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **out_arr, int *out_size) { +int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **input_arr, int *input_size) { API_BEGIN(); nnvm::Symbol *s = static_cast(sym); nnvm::Graph g; g.outputs = s->outputs; std::vector input_syms; const nnvm::IndexedGraph& idx = g.indexed_graph(); - size_t max_out_size = *out_size; + size_t max_input_size = *input_size; // Go through all nodes and return the ones representing variables. for (size_t i = 0; i < idx.num_nodes(); i++) { const nnvm::Node &n = *idx[i].source; @@ -365,9 +365,9 @@ int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **out_arr, int *out_s } } } - CHECK(input_syms.size() <= max_out_size); - *out_size = input_syms.size(); - memcpy(out_arr, input_syms.data(), sizeof(*out_arr) * input_syms.size()); + CHECK(input_syms.size() <= max_input_size); + *input_size = input_syms.size(); + memcpy(input_arr, input_syms.data(), sizeof(*input_arr) * input_syms.size()); API_END_HANDLE_ERROR(); } diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 479e6b62fbed..5f21a08895e4 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -373,6 +373,7 @@ inline void PushFCompute(const FCompute& fn, static auto& fexec_type = nnvm::Op::GetAttr("FExecType"); bool is_train = Imperative::Get()->is_training(); + bool need_grad = Imperative::Get()->is_recording(); ExecType exec_type = fexec_type.count(op) ? fexec_type[op](attrs) : ExecType::kSync; CHECK(exec_type == ExecType::kSync); std::vector inputs, outputs; @@ -393,7 +394,6 @@ inline void PushFCompute(const FCompute& fn, &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst, &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx); // setup context - bool need_grad = Imperative::Get()->is_recording(); OpContext opctx{need_grad, is_train, rctx, engine::CallbackOnComplete(), requested}; bool is_gpu = ctx.dev_mask() == gpu::kDevMask; // pre-fcompute fallback, cast to default storage type @@ -421,11 +421,11 @@ inline void PushFComputeEx(const FComputeEx& fn, static auto& fexec_type = nnvm::Op::GetAttr("FExecType"); bool is_train = Imperative::Get()->is_training(); + bool need_grad = Imperative::Get()->is_recording(); ExecType exec_type = fexec_type.count(op) ? 
fexec_type[op](attrs) : ExecType::kSync; std::vector inputs, outputs; DerefInputOutput(p_inputs, p_outputs, &inputs, &outputs); const auto& run = [=](RunContext rctx) { - bool need_grad = Imperative::Get()->is_recording(); OpContext opctx{need_grad, is_train, rctx, engine::CallbackOnComplete(), requested}; #if MXNET_USE_MKLDNN == 1 InvalidateOutputs(outputs, req); @@ -461,6 +461,7 @@ inline void PushOperator(const OpStatePtr& state, static auto& fexec_type = nnvm::Op::GetAttr("FExecType"); bool is_train = Imperative::Get()->is_training(); + bool need_grad = Imperative::Get()->is_recording(); ExecType exec_type = fexec_type.count(op) ? fexec_type[op](attrs) : ExecType::kSync; std::vector inputs, outputs; DerefInputOutput(p_inputs, p_outputs, &inputs, &outputs); @@ -472,7 +473,6 @@ inline void PushOperator(const OpStatePtr& state, if (fcompute_ex != nullptr && dispatch_mode == DispatchMode::kFComputeEx) { const auto& run = [=](RunContext rctx, engine::CallbackOnComplete on_complete) { - bool need_grad = Imperative::Get()->is_recording(); OpContext opctx{need_grad, is_train, rctx, on_complete, requested}; #if MXNET_USE_MKLDNN == 1 InvalidateOutputs(outputs, req); @@ -506,7 +506,6 @@ inline void PushOperator(const OpStatePtr& state, << "for stateful operator " << op->name; const auto& run = [=](RunContext rctx, engine::CallbackOnComplete on_complete) { - bool need_grad = Imperative::Get()->is_recording(); OpContext opctx{need_grad, is_train, rctx, on_complete, requested}; std::vector input_blobs, output_blobs; diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 76f33f9ff8f8..6cb28de02afa 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -78,8 +78,10 @@ class ForeachState { void Forward(std::vector cinputs, const std::vector& req, - std::vector coutputs, bool is_recording); - void Backward(int iter_no, std::vector ograds, + std::vector coutputs, + bool is_recording); + void Backward(int iter_no, + std::vector ograds, const std::vector &req, std::vector igrads); void Cleanup() { @@ -92,7 +94,8 @@ class ForeachState { void ForeachState::Forward(std::vector cinputs, const std::vector& req, - std::vector coutputs, bool is_recording) { + std::vector coutputs, + bool is_recording) { using namespace nnvm; using namespace imperative; @@ -150,7 +153,8 @@ void ForeachState::Forward(std::vector cinputs, Imperative::Get()->set_is_recording(orig_is_record); } -void ForeachState::Backward(int iter_no, std::vector ograds, +void ForeachState::Backward(int iter_no, + std::vector ograds, const std::vector &req, std::vector igrads) { using namespace nnvm; @@ -184,7 +188,7 @@ void ForeachState::Backward(int iter_no, std::vector ograds, CHECK_EQ(outputs.size(), op->num_inputs()); CHECK(!Imperative::AGInfo::IsNone(all_outputs[iter_no][0])); - const nnvm::NodeEntry &node_entry = all_outputs[iter_no][0].GetAutogradEntry(); + const nnvm::NodeEntry &node_entry = all_outputs[iter_no][0].entry(); OpStatePtr state = Imperative::AGInfo::Get(node_entry.node).state; op->Backward(false, state, inputs, req, outputs); } From c03c56fcaff4a363fdc1afdd2c8202dc2c0f319d Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 21 May 2018 23:08:40 +0000 Subject: [PATCH 059/135] update. 
--- python/mxnet/symbol/contrib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 383582393ac7..2d067555e8dc 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -111,7 +111,7 @@ def _get_graph_inputs(subg): return syms def foreach(body, data, init_states, name="foreach"): - """Run a for loop with user-defined computation over NDArrays on dimension 0. + """Run a for loop with user-defined computation over Symbols on dimension 0. This operator simulates a for loop and body has the computation for an iteration of the for loop. It runs the computation in body on each slice from the input From 224f3e299ccd9d99a2f8d36703bfc9bebdcaa796 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 22 May 2018 00:31:45 +0000 Subject: [PATCH 060/135] check for loop only works for dense arrays. --- src/operator/nn/control_flow.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/operator/nn/control_flow.cc b/src/operator/nn/control_flow.cc index 6cb28de02afa..df28d5cb2cae 100644 --- a/src/operator/nn/control_flow.cc +++ b/src/operator/nn/control_flow.cc @@ -211,6 +211,9 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, } for (size_t i = 0; i < (size_t) params.num_out_data; i++) CHECK_EQ(len, outputs[i].shape()[iter_dim]); + for (const auto &arr : outputs) + CHECK_EQ(arr.storage_type(), kDefaultStorage) + << "The for operator doesn't support the sparse format"; // Initialize the outputs of the subgraph is a little trickier. // The states from the previous iteration are used as the inputs of the next @@ -302,6 +305,9 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, const ForeachParam& params = state.params; CHECK_EQ(outputs.size(), (size_t) params.num_args - 1); CHECK_GT(params.in_data_locs.ndim(), 0); + for (const auto &arr : outputs) + CHECK_EQ(arr.storage_type(), kDefaultStorage) + << "The for operator doesn't support the sparse format"; size_t iter_dim = 0; std::unordered_set in_data_locs(params.in_data_locs.begin(), params.in_data_locs.end()); From cd67c6ffba8d8ec008c7ee571c6816279a5b1a69 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 22 May 2018 00:32:30 +0000 Subject: [PATCH 061/135] move control flow op out of nn/ --- src/operator/{nn => }/control_flow.cc | 0 src/operator/{nn => }/subgraph_op_common.cc | 0 src/operator/{nn => }/subgraph_op_common.h | 6 +++--- 3 files changed, 3 insertions(+), 3 deletions(-) rename src/operator/{nn => }/control_flow.cc (100%) rename src/operator/{nn => }/subgraph_op_common.cc (100%) rename src/operator/{nn => }/subgraph_op_common.h (93%) diff --git a/src/operator/nn/control_flow.cc b/src/operator/control_flow.cc similarity index 100% rename from src/operator/nn/control_flow.cc rename to src/operator/control_flow.cc diff --git a/src/operator/nn/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc similarity index 100% rename from src/operator/nn/subgraph_op_common.cc rename to src/operator/subgraph_op_common.cc diff --git a/src/operator/nn/subgraph_op_common.h b/src/operator/subgraph_op_common.h similarity index 93% rename from src/operator/nn/subgraph_op_common.h rename to src/operator/subgraph_op_common.h index 2025a2baefa2..25cbd60f5b63 100644 --- a/src/operator/nn/subgraph_op_common.h +++ b/src/operator/subgraph_op_common.h @@ -17,8 +17,8 @@ * under the License. 
*/ -#ifndef MXNET_OPERATOR_NN_SUBGRAPH_OP_COMMON_H_ -#define MXNET_OPERATOR_NN_SUBGRAPH_OP_COMMON_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_OP_COMMON_H_ +#define MXNET_OPERATOR_SUBGRAPH_OP_COMMON_H_ #include #include @@ -58,4 +58,4 @@ bool InferSubgraphBackwardStorage(const nnvm::Symbol &subgraph, } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_NN_SUBGRAPH_OP_COMMON_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_OP_COMMON_H_ From 742ef4061491eb2a47d49676c9fc3810791731d0 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 22 May 2018 17:24:13 +0000 Subject: [PATCH 062/135] fix include. --- src/operator/control_flow.cc | 6 +++--- src/operator/subgraph_op_common.cc | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index df28d5cb2cae..37d2b3168c6c 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -24,9 +24,9 @@ #include #include #include -#include "../operator_common.h" -#include "../elemwise_op_common.h" -#include "../../imperative/imperative_utils.h" +#include "./operator_common.h" +#include "./elemwise_op_common.h" +#include "../imperative/imperative_utils.h" #include "./subgraph_op_common.h" namespace mxnet { diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index ac2218b062fe..8344c24ab558 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -18,8 +18,8 @@ */ #include "./subgraph_op_common.h" -#include "../operator_common.h" -#include "../../imperative/imperative_utils.h" +#include "./operator_common.h" +#include "../imperative/imperative_utils.h" namespace mxnet { namespace op { From 4492949eee04e3f5fa36e3a1fa46ff08304b35dd Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 22 May 2018 17:24:31 +0000 Subject: [PATCH 063/135] add a test in gluon. 
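The new test wraps a gluon RNNCell in a HybridBlock whose hybrid_forward
calls F.contrib.foreach with the cell as the loop body, then checks that
the imperative run and the hybridized run (where the loop is lowered to a
single _foreach node) give matching outputs and matching parameters after
one SGD step. The core pattern, roughly (the actual test below copies the
parameters between two separate layer instances):

    layer = TestRNNLayer(hidden_size)
    layer.initialize()
    imp = layer(rnn_data, states)    # imperative: Python fallback loop
    layer.hybridize()
    hyb = layer(rnn_data, states)    # hybridized: single _foreach op
    assert_almost_equal(imp.asnumpy(), hyb.asnumpy(), rtol=1e-3, atol=1e-4)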
--- tests/python/unittest/test_gluon_rnn.py | 62 ++++++++++++++++--------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index f291733c5e5c..cd0ffd9b72be 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -18,9 +18,10 @@ import mxnet as mx from mxnet import gluon import numpy as np +import copy from numpy.testing import assert_allclose import unittest -from mxnet.test_utils import almost_equal +from mxnet.test_utils import almost_equal, assert_almost_equal def test_rnn(): @@ -36,33 +37,52 @@ def test_rnn(): assert outs == [(10, 100), (10, 100), (10, 100)] -class RNNLayer(gluon.HybridBlock): - def __init__(self, prefix=None, params=None): - super(RNNLayer, self).__init__(prefix=prefix, params=params) - self.cell = gluon.contrib.rnn.RNNCell(100, prefix='rnn_') +class TestRNNLayer(gluon.HybridBlock): + def __init__(self, hidden_size, prefix=None, params=None): + super(TestRNNLayer, self).__init__(prefix=prefix, params=params) + self.cell = gluon.rnn.RNNCell(hidden_size, prefix='rnn_') - def hybrid_forward(self, F, inputs, states=None): - return self.cell.unroll(inputs, states) + def hybrid_forward(self, F, inputs, states): + states = [states] + out, states = F.contrib.foreach(self.cell, inputs, states) + return out def test_contrib_rnn(): - contrib_cell = gluon.contrib.rnn.RNNCell(100, prefix='rnn_') - inputs = mx.sym.Variable('rnn_data') - contrib_outputs, _ = contrib_cell.unroll(inputs) - assert sorted(contrib_cell.collect_params().keys()) == ['rnn_h2h_bias', 'rnn_h2h_weight', - 'rnn_i2h_bias', 'rnn_i2h_weight'] - - args, outs, auxs = contrib_outputs.infer_shape(rnn_data=(3, 10,50)) - assert outs == [(3, 10, 100)] - - rnn_data = mx.nd.normal(loc=0, scale=1, shape=(3, 10, 50)) - layer = RNNLayer() + batch_size = 10 + hidden_size = 100 + rnn_data = mx.nd.normal(loc=0, scale=1, shape=(5, batch_size, 50)) + states = mx.nd.normal(loc=0, scale=1, shape=(batch_size, hidden_size)) + layer = TestRNNLayer(hidden_size) layer.initialize(ctx=mx.cpu(0)) - res1 = layer(rnn_data) + res1 = layer(rnn_data, states) + params1 = layer.collect_params() + orig_params1 = copy.deepcopy(params1) + + trainer = gluon.Trainer(params1, 'sgd', {'learning_rate' : 0.03}) + with mx.autograd.record(): + res1 = layer(rnn_data, states) + res1.backward() + trainer.step(batch_size) - layer = RNNLayer() + layer = TestRNNLayer(hidden_size) layer.initialize(ctx=mx.cpu(0)) layer.hybridize() - res2 = layer(rnn_data) + res2 = layer(rnn_data, states) + params2 = layer.collect_params() + for key, val in orig_params1.items(): + params2[key].set_data(val.data()) + + trainer = gluon.Trainer(params2, 'sgd', {'learning_rate' : 0.03}) + with mx.autograd.record(): + res2 = layer(rnn_data, states) + assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=0.001, atol=0.0001) + res2.backward() + trainer.step(batch_size) + + for key, val in params1.items(): + weight1 = val.data() + weight2 = params2[key].data() + assert_almost_equal(weight1.asnumpy(), weight2.asnumpy(), rtol=0.001, atol=0.0001) def test_lstm(): From 26e3e7e46928ce9d4858c7b4e774f0af688611a4 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 22 May 2018 17:58:10 +0000 Subject: [PATCH 064/135] small fix. 
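When creating a cached segment for bulk execution, stop the segment at a
node whose executor holds a subgraph (such as foreach) instead of merely
skipping that node: these operators cannot run inside an engine-bulked
segment, so the segment has to end there rather than continue across it.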
--- src/executor/graph_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 4478e0bb44b2..5616000f2743 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1668,7 +1668,7 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start, if (inode.source->is_variable()) continue; // We shouldn't add control flow operators to a segment. // We can't execute these operators in the engine. - if (op_node.exec->HasSubgraph()) continue; + if (op_node.exec->HasSubgraph()) return ret; if (op_node.exec->exec_type() != ExecType::kSync) { return ret; } From 6bff44860e767b44f8cd387084daf1979bcc0f84 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 22 May 2018 18:02:57 +0000 Subject: [PATCH 065/135] remove subgraph_name --- python/mxnet/symbol/contrib.py | 19 +++++++++---------- src/c_api/c_api_symbolic.cc | 5 ++--- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 2d067555e8dc..c1a04a105d74 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -187,16 +187,15 @@ def check_data(inputs, in_type, msg): # the python function, we need to prune the computation graph constructed from # the function. One way of doing it is to mark the nodes in the computation graph # with AttrScope and prune the nodes without the special attribute. - with AttrScope(subgraph_name=name): - if isinstance(data, list): - in_eles = [symbol.var(sym.name) for sym in data] - else: - in_eles = symbol.var(data.name) - if isinstance(init_states, list): - states = [symbol.var(s.name) for s in init_states] - else: - states = symbol.var(init_states.name) - sym_out, sym_states = body(in_eles, states) + if isinstance(data, list): + in_eles = [symbol.var(sym.name) for sym in data] + else: + in_eles = symbol.var(data.name) + if isinstance(init_states, list): + states = [symbol.var(s.name) for s in init_states] + else: + states = symbol.var(init_states.name) + sym_out, sym_states = body(in_eles, states) check_data(sym_out, symbol.Symbol, "the output should be an NDArray or a list of NDArrays") check_data(sym_states, symbol.Symbol, diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 175ce563b861..61a111a5b0ff 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -38,11 +38,10 @@ void RegisterLegacyOpProp(); void RegisterLegacyNDFunc(); } const std::vector kHiddenKeys = { - "ctx_group", "lr_mult", "wd_mult", "force_mirroring", "mirror_stage", "subgraph_name" + "ctx_group", "lr_mult", "wd_mult", "force_mirroring", "mirror_stage" }; const std::vector kReplacedHiddenKeys = { - "__ctx_group__", "__lr_mult__", "__wd_mult__", "__force_mirroring__", "__mirror_stage__", - "subgraph_name" + "__ctx_group__", "__lr_mult__", "__wd_mult__", "__force_mirroring__", "__mirror_stage__" }; const char *kNamespaceSeparator = "$"; From 1e4cd456fe43e5d92ec1206864129b246e8ecafb Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 22 May 2018 18:32:41 +0000 Subject: [PATCH 066/135] create loop state for reuse in the future. 
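Move the per-iteration Forward/Backward machinery and the recorded
CachedOps out of ForeachState into a generic LoopState in
subgraph_op_common, so that other operators built around a subgraph can
reuse it later; ForeachState now only adds the foreach parameters on top
of LoopState.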
--- src/operator/control_flow.cc | 131 +---------------------------- src/operator/subgraph_op_common.cc | 101 ++++++++++++++++++++++ src/operator/subgraph_op_common.h | 37 ++++++++ 3 files changed, 140 insertions(+), 129 deletions(-) diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 37d2b3168c6c..0210bd45c80a 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -57,142 +57,15 @@ struct ForeachParam : public dmlc::Parameter { DMLC_REGISTER_PARAMETER(ForeachParam); -class ForeachState { - // These are output arrays from all iterations. - // They also contain the Op state for each CachedOp. - std::vector > all_outputs; - std::vector > all_inputs; - std::vector > all_gradients; - std::vector iter_ops; - +class ForeachState: public LoopState { public: - Symbol subgraph_sym; - nnvm::Graph subgraph; ForeachParam params; - ForeachState(const Symbol &g, const ForeachParam ¶ms) { - this->subgraph_sym = g; - this->subgraph.outputs = g.outputs; + ForeachState(const Symbol &g, const ForeachParam ¶ms) : LoopState(g) { this->params = params; } - - void Forward(std::vector cinputs, - const std::vector& req, - std::vector coutputs, - bool is_recording); - void Backward(int iter_no, - std::vector ograds, - const std::vector &req, - std::vector igrads); - void Cleanup() { - all_outputs.clear(); - all_inputs.clear(); - all_gradients.clear(); - iter_ops.clear(); - } }; -void ForeachState::Forward(std::vector cinputs, - const std::vector& req, - std::vector coutputs, - bool is_recording) { - using namespace nnvm; - using namespace imperative; - - bool orig_is_record; - if (is_recording) - orig_is_record = Imperative::Get()->set_is_recording(true); - else - orig_is_record = Imperative::Get()->is_recording(); - - std::vector inputs(cinputs.size()); - std::vector outputs(coutputs.size()); - for (size_t i = 0; i < inputs.size(); i++) - inputs[i] = &cinputs[i]; - for (size_t i = 0; i < outputs.size(); i++) - outputs[i] = &coutputs[i]; - - if (is_recording) { - all_inputs.push_back(cinputs); - std::vector gradients(cinputs.size()); - std::vector input_ptrs(cinputs.size()); - std::vector gradient_ptrs(cinputs.size()); - std::vector grad_reqs(cinputs.size()); - for (size_t i = 0; i < gradients.size(); i++) { - gradients[i] = NDArray(cinputs[i].shape(), cinputs[i].ctx(), - true, cinputs[i].dtype()); - input_ptrs[i] = &cinputs[i]; - gradient_ptrs[i] = &gradients[i]; - grad_reqs[i] = kWriteTo; - } - Imperative::Get()->MarkVariables(input_ptrs, grad_reqs, gradient_ptrs);; - } - - std::vector > kwargs; - kwargs.push_back(std::pair("inline_limit", "0")); - // Get input names. - const auto& idx = subgraph.indexed_graph(); - std::vector arg_names(idx.input_nodes().size()); - for (size_t i = 0; i < idx.input_nodes().size(); ++i) - arg_names[i] = idx[idx.input_nodes()[i]].source->attrs.name; - // We don't have parameters for the cached op. - std::unordered_map > params; - CachedOpPtr op = std::make_shared(subgraph_sym, kwargs, - arg_names, params); - // TODO(zhengda) we need to avoid shape inference and memory plan whenever the op is - // called. Currently, CachedOp allocates memory each time Forward is called. - // I need to fix this once the PR for static memory allocation in CachedOp is - // merged. 
https://github.com/apache/incubator-mxnet/pull/10817 - op->Forward(nullptr, inputs, outputs); - - if (is_recording) { - all_outputs.push_back(coutputs); - iter_ops.push_back(op); - } - - Imperative::Get()->set_is_recording(orig_is_record); -} - -void ForeachState::Backward(int iter_no, - std::vector ograds, - const std::vector &req, - std::vector igrads) { - using namespace nnvm; - using namespace imperative; - - CHECK_GT(iter_ops.size(), iter_no) - << "We didn't record the computation for iteration " << iter_no; - auto op = iter_ops[iter_no]; - std::vector inputs; - std::vector outputs; - inputs.reserve(op->num_backward_inputs()); - outputs.reserve(op->num_inputs()); - for (size_t i = 0; i < ograds.size(); i++) - inputs.push_back(&ograds[i]); - - const std::vector &save_inputs = op->save_inputs(); - const std::vector &save_outputs = op->save_outputs(); - CHECK_EQ(save_inputs.size(), all_inputs[iter_no].size()); - CHECK_EQ(op->num_outputs(), all_outputs[iter_no].size()); - for (size_t i = 0; i < all_inputs[iter_no].size(); i++) { - if (save_inputs[i]) - inputs.push_back(&all_inputs[iter_no][i]); - } - for (size_t i = 0; i < all_outputs[iter_no].size(); i++) { - if (save_outputs[i]) - inputs.push_back(&all_outputs[iter_no][i]); - } - CHECK_EQ(inputs.size(), op->num_backward_inputs()); - for (size_t i = 0; i < igrads.size(); i++) - outputs.push_back(&igrads[i]); - CHECK_EQ(outputs.size(), op->num_inputs()); - - CHECK(!Imperative::AGInfo::IsNone(all_outputs[iter_no][0])); - const nnvm::NodeEntry &node_entry = all_outputs[iter_no][0].entry(); - OpStatePtr state = Imperative::AGInfo::Get(node_entry.node).state; - op->Backward(false, state, inputs, req, outputs); -} - static void ForeachComputeExCPU(const OpStatePtr& state_ptr, const OpContext& ctx, const std::vector& inputs, diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index 8344c24ab558..fa22898c13d4 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -151,5 +151,106 @@ bool InferSubgraphBackwardStorage(const nnvm::Symbol &subgraph, return true; } +void LoopState::Forward(std::vector cinputs, + const std::vector& req, + std::vector coutputs, + bool is_recording) { + using namespace nnvm; + using namespace imperative; + + bool orig_is_record; + if (is_recording) + orig_is_record = Imperative::Get()->set_is_recording(true); + else + orig_is_record = Imperative::Get()->is_recording(); + + std::vector inputs(cinputs.size()); + std::vector outputs(coutputs.size()); + for (size_t i = 0; i < inputs.size(); i++) + inputs[i] = &cinputs[i]; + for (size_t i = 0; i < outputs.size(); i++) + outputs[i] = &coutputs[i]; + + if (is_recording) { + all_inputs.push_back(cinputs); + std::vector gradients(cinputs.size()); + std::vector input_ptrs(cinputs.size()); + std::vector gradient_ptrs(cinputs.size()); + std::vector grad_reqs(cinputs.size()); + for (size_t i = 0; i < gradients.size(); i++) { + gradients[i] = NDArray(cinputs[i].shape(), cinputs[i].ctx(), + true, cinputs[i].dtype()); + input_ptrs[i] = &cinputs[i]; + gradient_ptrs[i] = &gradients[i]; + grad_reqs[i] = kWriteTo; + } + Imperative::Get()->MarkVariables(input_ptrs, grad_reqs, gradient_ptrs);; + } + + std::vector > kwargs; + kwargs.push_back(std::pair("inline_limit", "0")); + // Get input names. 
+ const auto& idx = subgraph.indexed_graph(); + std::vector arg_names(idx.input_nodes().size()); + for (size_t i = 0; i < idx.input_nodes().size(); ++i) + arg_names[i] = idx[idx.input_nodes()[i]].source->attrs.name; + // We don't have parameters for the cached op. + std::unordered_map > params; + CachedOpPtr op = std::make_shared(subgraph_sym, kwargs, + arg_names, params); + // TODO(zhengda) we need to avoid shape inference and memory plan whenever the op is + // called. Currently, CachedOp allocates memory each time Forward is called. + // I need to fix this once the PR for static memory allocation in CachedOp is + // merged. https://github.com/apache/incubator-mxnet/pull/10817 + op->Forward(nullptr, inputs, outputs); + + if (is_recording) { + all_outputs.push_back(coutputs); + iter_ops.push_back(op); + } + + Imperative::Get()->set_is_recording(orig_is_record); +} + +void LoopState::Backward(int iter_no, + std::vector ograds, + const std::vector &req, + std::vector igrads) { + using namespace nnvm; + using namespace imperative; + + CHECK_GT(iter_ops.size(), iter_no) + << "We didn't record the computation for iteration " << iter_no; + auto op = iter_ops[iter_no]; + std::vector inputs; + std::vector outputs; + inputs.reserve(op->num_backward_inputs()); + outputs.reserve(op->num_inputs()); + for (size_t i = 0; i < ograds.size(); i++) + inputs.push_back(&ograds[i]); + + const std::vector &save_inputs = op->save_inputs(); + const std::vector &save_outputs = op->save_outputs(); + CHECK_EQ(save_inputs.size(), all_inputs[iter_no].size()); + CHECK_EQ(op->num_outputs(), all_outputs[iter_no].size()); + for (size_t i = 0; i < all_inputs[iter_no].size(); i++) { + if (save_inputs[i]) + inputs.push_back(&all_inputs[iter_no][i]); + } + for (size_t i = 0; i < all_outputs[iter_no].size(); i++) { + if (save_outputs[i]) + inputs.push_back(&all_outputs[iter_no][i]); + } + CHECK_EQ(inputs.size(), op->num_backward_inputs()); + for (size_t i = 0; i < igrads.size(); i++) + outputs.push_back(&igrads[i]); + CHECK_EQ(outputs.size(), op->num_inputs()); + + CHECK(!Imperative::AGInfo::IsNone(all_outputs[iter_no][0])); + const nnvm::NodeEntry &node_entry = all_outputs[iter_no][0].entry(); + OpStatePtr state = Imperative::AGInfo::Get(node_entry.node).state; + op->Backward(false, state, inputs, req, outputs); +} + } // namespace op } // namespace mxnet diff --git a/src/operator/subgraph_op_common.h b/src/operator/subgraph_op_common.h index 25cbd60f5b63..74e7cb2d1ccd 100644 --- a/src/operator/subgraph_op_common.h +++ b/src/operator/subgraph_op_common.h @@ -24,6 +24,7 @@ #include #include #include +#include "../imperative/imperative_utils.h" namespace mxnet { namespace op { @@ -55,6 +56,42 @@ bool InferSubgraphBackwardStorage(const nnvm::Symbol &subgraph, std::vector *in_attrs, std::vector *out_attrs); +/* + * This contains the states for running a loop and provides methods + * of running the subgraph computation for an iteration. + */ +class LoopState { + // These are output arrays from all iterations. + // They also contain the Op state for each CachedOp. 
+ std::vector > all_outputs; + std::vector > all_inputs; + std::vector > all_gradients; + std::vector iter_ops; + Symbol subgraph_sym; + nnvm::Graph subgraph; + + public: + LoopState(const Symbol &g) { + this->subgraph_sym = g; + this->subgraph.outputs = g.outputs; + } + + void Forward(std::vector cinputs, + const std::vector& req, + std::vector coutputs, + bool is_recording); + void Backward(int iter_no, + std::vector ograds, + const std::vector &req, + std::vector igrads); + void Cleanup() { + all_outputs.clear(); + all_inputs.clear(); + all_gradients.clear(); + iter_ops.clear(); + } +}; + } // namespace op } // namespace mxnet From 7079e73660c9697ba3ca52f3a0ddbfc2c6eda9b0 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 22 May 2018 17:42:39 +0000 Subject: [PATCH 067/135] work for GPU. --- src/operator/control_flow.cc | 6 +++++- tests/python/unittest/test_operator.py | 8 ++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 0210bd45c80a..c42aca0944d9 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -390,6 +390,9 @@ NNVM_REGISTER_OP(_foreach) .set_attr("FInferShape", ForeachShape) .set_attr("FInferType", ForeachType) .set_attr("FStatefulComputeEx", ForeachComputeExCPU) +// Foreach operator works like an executor. Its code will always run on CPU. +// So the same code can be registered for both CPU and GPU. +.set_attr("FStatefulComputeEx", ForeachComputeExCPU) .set_attr("key_var_num_args", "num_args") .add_argument("fn", "Symbol", "Input graph.") .add_argument("data", "NDArray-or-Symbol[]", @@ -409,7 +412,8 @@ NNVM_REGISTER_OP(_backward_foreach) .set_attr_parser(ParamParser) .set_attr("TIsLayerOpBackward", true) .set_attr("TIsBackward", true) -.set_attr("FStatefulComputeEx", ForeachGradComputeExCPU); +.set_attr("FStatefulComputeEx", ForeachGradComputeExCPU) +.set_attr("FStatefulComputeEx", ForeachGradComputeExCPU); } // namespace op } // namespace mxnet diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index aac88e023991..e5b9a05fb1b6 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5993,7 +5993,7 @@ def verify_foreach(step, in_syms, state_syms, free_syms, name = name[1:] gin_order.append(int(name)) - e = out.bind(ctx=mx.cpu(), args=arg_dict, args_grad=arg_grad_dict) + e = out.bind(ctx=default_context(), args=arg_dict, args_grad=arg_grad_dict) e.forward(is_train=is_train) if (is_train): # backward @@ -6112,7 +6112,7 @@ def step(in1, states): state = mx.nd.arange(2) data_grad = mx.nd.empty(data.shape) state_grad = mx.nd.empty(state.shape) - e = out.bind(ctx=mx.cpu(), args={'v1':data, 'v2':state}, + e = out.bind(ctx=default_context(), args={'v1':data, 'v2':state}, args_grad={'v1':data_grad, 'v2':state_grad}) e.forward(is_train=True) out = mx.nd.zeros_like(data) @@ -6180,7 +6180,7 @@ def sym_group(out): h2h_barr_grad1 = mx.nd.empty(h2h_barr.shape) out = mx.sym.contrib.foreach(step, data, [init_h, init_c]) out = sym_group(out) - e1 = out.bind(ctx=mx.cpu(), + e1 = out.bind(ctx=default_context(), args={'data': data_arr, 'h': h_arr, 'c': c_arr, 'i2h_weight': i2h_warr, 'h2h_weight': h2h_warr, 'i2h_bias': i2h_barr, 'h2h_bias': h2h_barr}, @@ -6211,7 +6211,7 @@ def sym_group(out): unroll_outs.append(mx.sym.expand_dims(h, axis=0)) unroll_outs = mx.sym.concat(*unroll_outs, dim=0) out = mx.sym.Group([unroll_outs, h, c]) - e2 = out.bind(ctx=mx.cpu(), + e2 = out.bind(ctx=default_context(), 
args={'data': data_arr, 'h': h_arr, 'c': c_arr, 'mylstm_i2h_weight': i2h_warr, 'mylstm_h2h_weight': h2h_warr, 'mylstm_i2h_bias': i2h_barr, 'mylstm_h2h_bias': h2h_barr}, From 64f43624b8d0fdec4d3e5e8a22d980dd2e415464 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 29 May 2018 22:18:33 +0000 Subject: [PATCH 068/135] Fix tests. --- tests/python/unittest/test_operator.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index e5b9a05fb1b6..0e3ad074beb4 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -6019,10 +6019,14 @@ def verify_foreach(step, in_syms, state_syms, free_syms, res2 = _as_list(res) for i in range(len(res2)): res2[i] = res2[i] * 2 + outs = [] + outs[:] = res2[:] if isinstance(states, list): + outs.extend(states) states = [mx.nd.expand_dims(s, 0) for s in states] res2.extend(states) else: + outs.append(states) states = mx.nd.expand_dims(states, 0) res2.append(states) res = mx.nd.concat(*res2, dim=0) @@ -6032,8 +6036,9 @@ def verify_foreach(step, in_syms, state_syms, free_syms, tmp_grads.extend(tmp_grads1) if (is_train): res.backward(mx.nd.concat(*tmp_grads, dim=0)) - for i in range(len(res2)): - assert_almost_equal(e.outputs[i].asnumpy(), res2[i].asnumpy(), + for i in range(len(outs)): + assert e.outputs[i].shape == outs[i].shape + assert_almost_equal(e.outputs[i].asnumpy(), outs[i].asnumpy(), rtol=0.001, atol=0.0001) if (is_train): all_ins = _as_list(in_arrs)[:] From 31d91123802b5b9e0e8e35dd87edfbcd85661051 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Wed, 30 May 2018 15:15:08 -0700 Subject: [PATCH 069/135] Fix bugs caused by ctypes (#29) --- include/mxnet/c_api.h | 2 +- python/mxnet/symbol/contrib.py | 2 +- src/c_api/c_api_symbolic.cc | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index afdd666765a5..9bac3f773688 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1058,7 +1058,7 @@ MXNET_DLL int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, * \param outs The input symbols of the graph. * \param out_size the number of input symbols returned. */ -MXNET_DLL int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **inputs, +MXNET_DLL int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle *inputs, int *input_size); /*! 
diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index c1a04a105d74..fd12c69af2b9 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -106,7 +106,7 @@ def _get_graph_inputs(subg): syms = [] for i in range(num_handles.value): - s = Symbol(handles[i]) + s = Symbol(SymbolHandle(handles[i])) syms.append(s) return syms diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 61a111a5b0ff..6b53fc2083e9 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -344,7 +344,7 @@ int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, API_END(); } -int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **input_arr, int *input_size) { +int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle *input_arr, int *input_size) { API_BEGIN(); nnvm::Symbol *s = static_cast(sym); nnvm::Graph g; @@ -366,7 +366,7 @@ int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **input_arr, int *inp } CHECK(input_syms.size() <= max_input_size); *input_size = input_syms.size(); - memcpy(input_arr, input_syms.data(), sizeof(*input_arr) * input_syms.size()); + std::copy(input_syms.begin(), input_syms.end(), input_arr); API_END_HANDLE_ERROR(); } From 601edbefc16cf07687df00060ad6e80c33b6c0c0 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Fri, 1 Jun 2018 13:51:29 -0700 Subject: [PATCH 070/135] Add save/load json in testcases for foreach (#30) --- tests/python/unittest/test_operator.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 0e3ad074beb4..bbac4eb8d99d 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5965,6 +5965,10 @@ def verify_foreach(step, in_syms, state_syms, free_syms, out[i] = out[i] * 2 out.extend(states) out = mx.sym.Group(out) + js_1 = out.tojson() + out = mx.sym.load_json(js_1) + js_2 = out.tojson() + assert js_1 == js_2 arr_grads = [] arg_dict = {} arg_grad_dict = {} @@ -6112,6 +6116,10 @@ def step(in1, states): out1[i] = out1[i] out1.extend(out[1]) out = mx.sym.Group(out1) + js_1 = out.tojson() + out = mx.sym.load_json(js_1) + js_2 = out.tojson() + assert js_1 == js_2 data = mx.nd.arange(4).reshape((1, 2, 2)) state = mx.nd.arange(2) @@ -6185,6 +6193,11 @@ def sym_group(out): h2h_barr_grad1 = mx.nd.empty(h2h_barr.shape) out = mx.sym.contrib.foreach(step, data, [init_h, init_c]) out = sym_group(out) + js_1 = out.tojson() + out = mx.sym.load_json(js_1) + js_2 = out.tojson() + assert js_1 == js_2 + e1 = out.bind(ctx=default_context(), args={'data': data_arr, 'h': h_arr, 'c': c_arr, 'i2h_weight': i2h_warr, 'h2h_weight': h2h_warr, @@ -6216,6 +6229,11 @@ def sym_group(out): unroll_outs.append(mx.sym.expand_dims(h, axis=0)) unroll_outs = mx.sym.concat(*unroll_outs, dim=0) out = mx.sym.Group([unroll_outs, h, c]) + js_1 = out.tojson() + out = mx.sym.load_json(js_1) + js_2 = out.tojson() + assert js_1 == js_2 + e2 = out.bind(ctx=default_context(), args={'data': data_arr, 'h': h_arr, 'c': c_arr, 'mylstm_i2h_weight': i2h_warr, 'mylstm_h2h_weight': h2h_warr, From f4da935a7bd4cae0d5844447e7e5ab4d76586399 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 4 Jun 2018 22:58:49 +0000 Subject: [PATCH 071/135] support subgraph in stateful executor. 
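Stateful executors previously discarded the node attributes, so the graph executor could not tell whether a stateful operator such as `_foreach` carries a subgraph. The change below passes `NodeAttrs` into `StatefulComputeExecutor`/`StatefulComputeExExecutor` and adds a `HasSubgraph()` override that checks `attrs.subgraphs`, letting subgraph-carrying nodes be detected locally and kept out of bulked engine segments. The consumer side, as already used in `CreateCachedSegOpr` earlier in this series (shown here only for context):

    // Stop building the bulk segment as soon as an executor reports a subgraph;
    // such nodes are executed on the main thread, not inside the engine.
    if (op_node.exec->HasSubgraph()) return ret;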
--- src/executor/attach_op_execs_pass.cc | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index ca0c76e9df8f..4db1b7651fa0 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -126,6 +126,10 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { PostFCompute(is_gpu); } + bool HasSubgraph() const override { + return !attrs_.subgraphs.empty(); + } + ExecType exec_type() const override { return exec_type_; } @@ -138,14 +142,15 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { return state_; } - explicit StatefulComputeExecutor(const OpStatePtr& state, + explicit StatefulComputeExecutor(const NodeAttrs& attrs, const OpStatePtr& state, const FStatefulCompute& fcompute, ExecType exec_type, const std::vector &mutate_idx) - : StorageFallbackOpExecutor(mutate_idx), + : StorageFallbackOpExecutor(mutate_idx), attrs_(attrs), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} private: + NodeAttrs attrs_; OpStatePtr state_; FStatefulCompute fcompute_; ExecType exec_type_; @@ -163,6 +168,10 @@ class StatefulComputeExExecutor : public OpExecutor { fcompute_(state_, op_ctx, in_array, req, out_array); } + bool HasSubgraph() const override { + return !attrs_.subgraphs.empty(); + } + void Setup() override {} ExecType exec_type() const override { @@ -177,12 +186,13 @@ class StatefulComputeExExecutor : public OpExecutor { return state_; } - explicit StatefulComputeExExecutor(const OpStatePtr& state, + explicit StatefulComputeExExecutor(const NodeAttrs& attrs, const OpStatePtr& state, const FStatefulComputeEx& fcompute, ExecType exec_type) - : state_(state), fcompute_(fcompute), exec_type_(exec_type) {} + : attrs_(attrs), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} private: + NodeAttrs attrs_; OpStatePtr state_; FStatefulComputeEx fcompute_; ExecType exec_type_; @@ -301,14 +311,15 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i) { op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared(state, fcompute_ex, exec_type); + ret[i] = std::make_shared(inode.source->attrs, state, + fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute( op, "FStatefulCompute", vctx[i]); CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared(state, fcompute, + ret[i] = std::make_shared(inode.source->attrs, state, fcompute, exec_type, mutate_index); } } else if (is_layer_backward.get(op, false)) { @@ -320,7 +331,7 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i) { op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared( + ret[i] = std::make_shared(inode.source->attrs, ret[fwd_id].get()->state(), fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute( @@ -328,7 +339,7 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i) { CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << 
op->name; - ret[i] = std::make_shared( + ret[i] = std::make_shared(inode.source->attrs, ret[fwd_id].get()->state(), fcompute, exec_type, mutate_index); } } else { From 90b78291fcd0bbdf34f5a98445c75f01b468f253 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 4 Jun 2018 22:59:40 +0000 Subject: [PATCH 072/135] Fix compilation. --- src/operator/subgraph_op_common.cc | 7 +------ src/operator/subgraph_op_common.h | 1 + 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index fa22898c13d4..815809c995cb 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -191,13 +191,8 @@ void LoopState::Forward(std::vector cinputs, kwargs.push_back(std::pair("inline_limit", "0")); // Get input names. const auto& idx = subgraph.indexed_graph(); - std::vector arg_names(idx.input_nodes().size()); - for (size_t i = 0; i < idx.input_nodes().size(); ++i) - arg_names[i] = idx[idx.input_nodes()[i]].source->attrs.name; // We don't have parameters for the cached op. - std::unordered_map > params; - CachedOpPtr op = std::make_shared(subgraph_sym, kwargs, - arg_names, params); + CachedOpPtr op = std::make_shared(subgraph_sym, kwargs); // TODO(zhengda) we need to avoid shape inference and memory plan whenever the op is // called. Currently, CachedOp allocates memory each time Forward is called. // I need to fix this once the PR for static memory allocation in CachedOp is diff --git a/src/operator/subgraph_op_common.h b/src/operator/subgraph_op_common.h index 74e7cb2d1ccd..f65eb8ca489c 100644 --- a/src/operator/subgraph_op_common.h +++ b/src/operator/subgraph_op_common.h @@ -24,6 +24,7 @@ #include #include #include +#include "../imperative/cached_op.h" #include "../imperative/imperative_utils.h" namespace mxnet { From ae3ea22052c5df915df80e7cef96b0a595aef908 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 22 May 2018 22:07:05 +0000 Subject: [PATCH 073/135] move code. --- src/c_api/c_api_symbolic.cc | 21 ++++----------- src/nnvm/graph_editor.cc | 51 +++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 16 deletions(-) create mode 100644 src/nnvm/graph_editor.cc diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 6b53fc2083e9..2030d881d60a 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -344,26 +344,15 @@ int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, API_END(); } +namespace mxnet { +extern std::vector GetInputSymbols(const nnvm::Symbol &sym); +} + int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle *input_arr, int *input_size) { API_BEGIN(); nnvm::Symbol *s = static_cast(sym); - nnvm::Graph g; - g.outputs = s->outputs; - std::vector input_syms; - const nnvm::IndexedGraph& idx = g.indexed_graph(); size_t max_input_size = *input_size; - // Go through all nodes and return the ones representing variables. 
- for (size_t i = 0; i < idx.num_nodes(); i++) { - const nnvm::Node &n = *idx[i].source; - for (const nnvm::NodeEntry &e : n.inputs) { - auto p = e.node; - if (p->is_variable()) { - nnvm::Symbol *s = new nnvm::Symbol(); - s->outputs.push_back(e); - input_syms.push_back(s); - } - } - } + std::vector input_syms = mxnet::GetInputSymbols(*s); CHECK(input_syms.size() <= max_input_size); *input_size = input_syms.size(); std::copy(input_syms.begin(), input_syms.end(), input_arr); diff --git a/src/nnvm/graph_editor.cc b/src/nnvm/graph_editor.cc new file mode 100644 index 000000000000..7200f5b124e4 --- /dev/null +++ b/src/nnvm/graph_editor.cc @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file graph_editor.cc + * The functions in this file edit an NNVM graph. Potentially, + * these functions should be moved to NNVM in the future. + */ + +#include +#include + +namespace mxnet { + +std::vector GetInputSymbols(const nnvm::Symbol &sym) { + nnvm::Graph g; + std::vector input_syms; + g.outputs = sym.outputs; + const nnvm::IndexedGraph& idx = g.indexed_graph(); + // Go through all nodes and return the ones representing variables. + for (size_t i = 0; i < idx.num_nodes(); i++) { + const nnvm::Node &n = *idx[i].source; + for (const nnvm::NodeEntry &e : n.inputs) { + auto p = e.node; + if (p->is_variable()) { + nnvm::Symbol *s = new nnvm::Symbol(); + s->outputs.push_back(e); + input_syms.push_back(s); + } + } + } + return input_syms; +} + +} From 0db16f0b64009e489c1b4a9d63ce6393214bdcf1 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 23 May 2018 23:32:00 +0000 Subject: [PATCH 074/135] Revert "remove subgraph_name" This reverts commit 977f5624ad0b0dedb9dcb8629f975afc56bb1e1a. --- python/mxnet/symbol/contrib.py | 19 ++++++++++--------- src/c_api/c_api_symbolic.cc | 5 +++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index fd12c69af2b9..fe24c5c9fc65 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -187,15 +187,16 @@ def check_data(inputs, in_type, msg): # the python function, we need to prune the computation graph constructed from # the function. One way of doing it is to mark the nodes in the computation graph # with AttrScope and prune the nodes without the special attribute. 
- if isinstance(data, list): - in_eles = [symbol.var(sym.name) for sym in data] - else: - in_eles = symbol.var(data.name) - if isinstance(init_states, list): - states = [symbol.var(s.name) for s in init_states] - else: - states = symbol.var(init_states.name) - sym_out, sym_states = body(in_eles, states) + with AttrScope(subgraph_name=name): + if isinstance(data, list): + in_eles = [symbol.var(sym.name) for sym in data] + else: + in_eles = symbol.var(data.name) + if isinstance(init_states, list): + states = [symbol.var(s.name) for s in init_states] + else: + states = symbol.var(init_states.name) + sym_out, sym_states = body(in_eles, states) check_data(sym_out, symbol.Symbol, "the output should be an NDArray or a list of NDArrays") check_data(sym_states, symbol.Symbol, diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 2030d881d60a..4ff08e4932c9 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -38,10 +38,11 @@ void RegisterLegacyOpProp(); void RegisterLegacyNDFunc(); } const std::vector kHiddenKeys = { - "ctx_group", "lr_mult", "wd_mult", "force_mirroring", "mirror_stage" + "ctx_group", "lr_mult", "wd_mult", "force_mirroring", "mirror_stage", "subgraph_name" }; const std::vector kReplacedHiddenKeys = { - "__ctx_group__", "__lr_mult__", "__wd_mult__", "__force_mirroring__", "__mirror_stage__" + "__ctx_group__", "__lr_mult__", "__wd_mult__", "__force_mirroring__", "__mirror_stage__", + "subgraph_name" }; const char *kNamespaceSeparator = "$"; From 5f626aeff4c69c1853ff411ff7ccc8f2f49f28db Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 25 May 2018 02:38:57 +0000 Subject: [PATCH 075/135] cut graph. --- include/mxnet/c_api.h | 16 +++++- python/mxnet/symbol/contrib.py | 67 +++++++++++++++++--------- src/c_api/c_api_symbolic.cc | 54 +++++++++++++++++++++ src/nnvm/graph_editor.cc | 34 +++++++++++++ tests/python/unittest/test_operator.py | 22 +++++++-- 5 files changed, 165 insertions(+), 28 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 9bac3f773688..15d213273ad4 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1055,12 +1055,24 @@ MXNET_DLL int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, /*! * \brief Get the input symbols of the graph. * \param sym The graph. - * \param outs The input symbols of the graph. - * \param out_size the number of input symbols returned. + * \param inputs The input symbols of the graph. + * \param input_size the number of input symbols returned. */ MXNET_DLL int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle *inputs, int *input_size); +/*! + * \brief Cut a subgraph whose nodes are marked with a subgraph attribute. + * The input graph will be modified. A variable node will be created for each + * edge that connects to nodes outside the subgraph. The outside nodes that + * connect to the subgraph will be returned. + * \param sym The graph. + * \param inputs The nodes that connect to the subgraph. + * \param input_size The number of such nodes. + */ +MXNET_DLL int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle **inputs, + int *input_size); + /*! * \brief Get the detailed information about atomic symbol. * \param creator the AtomicSymbolCreator. 
diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index fe24c5c9fc65..b496b683834f 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -20,6 +20,7 @@ """Contrib Symbol API of MXNet.""" import math import ctypes +import re from .random import uniform from .symbol import Symbol @@ -110,6 +111,17 @@ def _get_graph_inputs(subg): syms.append(s) return syms +def _cut_subgraph(subg): + num_handles = ctypes.c_int(1000) + handles = c_array(SymbolHandle, [SymbolHandle(0) for i in range(1000)]) + check_call(_LIB.MXSymbolCutSubgraph(subg.handle, handles, ctypes.byref(num_handles))) + + syms = [] + for i in range(num_handles.value): + s = Symbol(handles[i]) + syms.append(s) + return syms + def foreach(body, data, init_states, name="foreach"): """Run a for loop with user-defined computation over Symbols on dimension 0. @@ -198,28 +210,31 @@ def check_data(inputs, in_type, msg): states = symbol.var(init_states.name) sym_out, sym_states = body(in_eles, states) - check_data(sym_out, symbol.Symbol, "the output should be an NDArray or a list of NDArrays") - check_data(sym_states, symbol.Symbol, - "the output states should be an NDArray or a list of NDArrays") - if isinstance(sym_states, list): - assert isinstance(init_states, list) and len(sym_states) == len(init_states), \ - "the number of output states (%d) should be the same as input states (%d)" \ - % (len(sym_states), len(init_states)) + check_data(sym_out, symbol.Symbol, + "the output should be an NDArray or a list of NDArrays") + check_data(sym_states, symbol.Symbol, + "the output states should be an NDArray or a list of NDArrays") + if isinstance(sym_states, list): + assert isinstance(init_states, list) and len(sym_states) == len(init_states), \ + "the number of output states (%d) should be the same as input states (%d)" \ + % (len(sym_states), len(init_states)) + + if isinstance(sym_out, list): + flat_out = sym_out + else: + flat_out = [sym_out] + num_out_data = len(flat_out) + if isinstance(sym_states, list): + for s in sym_states: + # There is a problem if the outputs are the same as the inputs + # or the first output. By calling identity, we can make sure that + # all symbols will refer to different NDArrays. + flat_out.append(symbol.op.identity(s)) + else: + flat_out.append(symbol.op.identity(sym_states)) + g = symbol.Group(flat_out) - if isinstance(sym_out, list): - flat_out = sym_out - else: - flat_out = [sym_out] - num_out_data = len(flat_out) - if isinstance(sym_states, list): - for s in sym_states: - # There is a problem if the outputs are the same as the inputs - # or the first output. By calling identity, we can make sure that - # all symbols will refer to different NDArrays. - flat_out.append(symbol.op.identity(s)) - else: - flat_out.append(symbol.op.identity(sym_states)) - g = symbol.Group(flat_out) + cut_syms = _cut_subgraph(g) input_syms = _get_graph_inputs(g) # Here we need to find out how the input symbols are ordered as well as @@ -230,12 +245,13 @@ def check_data(inputs, in_type, msg): gin_names = input_syms.keys() # This array contains the symbols for the inputs of foreach. # They are ordered according to the inputs of the subgraph. 
- ordered_ins = [] states_map = {sym.name:sym for sym in init_states} state_names = states_map.keys() data_syms = _as_list(data) data_map = {sym.name:sym for sym in data_syms} data_names = data_map.keys() + + ordered_ins = [] in_state_locs = [] in_data_locs = [] for in_name in g.list_inputs(): @@ -248,7 +264,12 @@ def check_data(inputs, in_type, msg): ordered_ins.append(data_map[in_name]) in_data_locs.append(len(ordered_ins) - 1) else: - ordered_ins.append(input_syms[in_name]) + # The remaining inputs are the ones cut from the original graph. + # The names of these variable nodes contain the index in cut_syms. + m = re.search(r'\d+$', in_name) + idx = int(m.group()) if m else None + assert idx < len(cut_syms) + ordered_ins.append(cut_syms[idx]) num_outputs = len(flat_out) num_states = len(state_names) diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 4ff08e4932c9..b825a9670700 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -346,7 +346,13 @@ int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, } namespace mxnet { + extern std::vector GetInputSymbols(const nnvm::Symbol &sym); +extern bool CutGraph(const std::vector &input_entries, + const std::string &in_name_prefix, bool skip_var, + std::vector *orig_entries, + std::vector *new_var_names); + } int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle *input_arr, int *input_size) { @@ -360,6 +366,54 @@ int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle *input_arr, int *inpu API_END_HANDLE_ERROR(); } +int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle **input_symbols, + int *input_size) { + // Given a graph, we want to fetch the nodes that have been marked as part of + // a subgraph. + API_BEGIN(); + nnvm::Symbol *s = static_cast(sym); + size_t max_input_size = *input_size; + std::string subg_attr = "__subgraph_name__"; + auto out_node = s->outputs[0].node; + auto it = out_node->attrs.dict.find(subg_attr); + if (it != out_node->attrs.dict.end()) { + std::string subg_name = it->second; + std::vector input_entries; + DFSVisit(s->outputs, [subg_attr, subg_name, &input_entries] + (nnvm::NodePtr n) { + // If the node itself isn't in the subgraph, we ignore it. + auto it = n->attrs.dict.find(subg_attr); + if (it == n->attrs.dict.end() || it->second != subg_name) + return; + + // We search for nodes whose node entries aren't in the subgraph. 
+ for (size_t j = 0; j < n->inputs.size(); j++) { + auto in_node = n->inputs[j].node; + auto it = in_node->attrs.dict.find(subg_attr); + if (it == in_node->attrs.dict.end() || it->second != subg_name) + input_entries.push_back(&n->inputs[j]); + } + }); + + std::vector orig_entries; + std::vector new_var_names; + CutGraph(input_entries, subg_name + "_var", false, &orig_entries, &new_var_names); + + std::vector input_syms(orig_entries.size()); + for (size_t i = 0; i < input_syms.size(); i++) { + input_syms[i] = new nnvm::Symbol(); + input_syms[i]->outputs.push_back(orig_entries[i]); + } + CHECK(input_syms.size() <= max_input_size); + *input_size = input_syms.size(); + memcpy(input_symbols, input_syms.data(), sizeof(*input_symbols) * input_syms.size()); + } else { + *input_size = 0; + } + + API_END_HANDLE_ERROR(); +} + int MXSymbolCreateFromFile(const char *fname, SymbolHandle *out) { nnvm::Symbol *s = new nnvm::Symbol(); API_BEGIN(); diff --git a/src/nnvm/graph_editor.cc b/src/nnvm/graph_editor.cc index 7200f5b124e4..98c99e2425df 100644 --- a/src/nnvm/graph_editor.cc +++ b/src/nnvm/graph_editor.cc @@ -25,9 +25,18 @@ #include #include +#include + +namespace nnvm { +NodePtr CreateVariableNode(const std::string& name); +} namespace mxnet { +/* + * Given a computation graph, this function finds the input nodes of the graph + * and create symbols for the input nodes. It returns the input symbols. + */ std::vector GetInputSymbols(const nnvm::Symbol &sym) { nnvm::Graph g; std::vector input_syms; @@ -48,4 +57,29 @@ std::vector GetInputSymbols(const nnvm::Symbol &sym) { return input_syms; } +/* + * Given a computation graph and a set of input node entries, this function cuts + * the node entries and creates new variable nodes as the input nodes of the + * subgraph. It returns the nodes that connect to the subgraph directly and + * the names of the new variable nodes. + */ +bool CutGraph(const std::vector &input_entries, + const std::string &in_name_prefix, bool skip_var, + std::vector *orig_entries, + std::vector *new_var_names) { + orig_entries->reserve(input_entries.size()); + for (size_t i = 0; i < input_entries.size(); i++) { + nnvm::NodeEntry *e = input_entries[i]; + // If the node is a variable itself, we may want to skip the node. 
+ if (e->node->is_variable() && skip_var) + continue; + + orig_entries->push_back(*e); + new_var_names->push_back(in_name_prefix + std::to_string(i)); + nnvm::NodePtr n = nnvm::CreateVariableNode(new_var_names->back()); + *e = nnvm::NodeEntry{n, 0, 0}; + } + return true; +} + } diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index bbac4eb8d99d..0ac9d458473d 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5957,7 +5957,8 @@ def step3(in1, states, free): return ([out, out * 2], [out * 2, out * 3]) def verify_foreach(step, in_syms, state_syms, free_syms, - in_arrs, init_states, frees, out_grads, is_train=True): + in_arrs, init_states, frees, out_grads, is_train=True, + free_vars_func=None): step_sym = lambda in_syms, state_syms : step(in_syms, state_syms, free_syms) res, states = mx.sym.contrib.foreach(step_sym, in_syms, state_syms) out = _as_list(res) @@ -6015,8 +6016,9 @@ def verify_foreach(step, in_syms, state_syms, free_syms, arr.attach_grad() for arr in frees: arr.attach_grad() - step_imp = lambda in_arrs, state_arrs : step(in_arrs, state_arrs, frees) with mx.autograd.record(): + frees_imp = frees if free_vars_func is None else free_vars_func(frees) + step_imp = lambda in_arrs, state_arrs : step(in_arrs, state_arrs, frees_imp) states = [mx.nd.expand_dims(s, 0) for s in init_states] res, states = mx.nd.contrib.foreach(step_imp, in_arrs, init_states) @@ -6060,7 +6062,21 @@ def verify_foreach(step, in_syms, state_syms, free_syms, # * multiple inputs and multiple outputs. # * inference. - states = [mx.nd.random.uniform(shape=(2))] + #states = [mx.nd.random.uniform(shape=(2))] + + #frees1 = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] + #arrs = mx.nd.random.uniform(shape=(3, 2)) + states = [mx.nd.arange(2)] + + frees1 = [mx.nd.arange(2), mx.nd.arange(2) + 1] + arrs = mx.nd.arange(6).reshape(shape=(3, 2)) + out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], + [mx.nd.random.uniform(-10, 10, states[0].shape)]] + verify_foreach(step1, v3, [v4], [v5 + v6], arrs, states, frees1, out_grads, True, + lambda frees : [frees[0] + frees[1]]) + verify_foreach(step1, v3, [v4], [v5 + v6], arrs, states, frees1, out_grads, False, + lambda frees : [frees[0] + frees[1]]) + frees = [mx.nd.random.uniform(shape=(2))] arrs = mx.nd.random.uniform(shape=(2, 2)) out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], From f2c428fa18256904e519f47fb8d21af63d2cb7bc Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 26 May 2018 01:45:24 +0000 Subject: [PATCH 076/135] rename new var nodes. --- python/mxnet/symbol/contrib.py | 11 ++++++----- src/c_api/c_api_symbolic.cc | 10 +++------- src/nnvm/graph_editor.cc | 11 +++++------ 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index b496b683834f..8d591224b31f 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -250,6 +250,8 @@ def check_data(inputs, in_type, msg): data_syms = _as_list(data) data_map = {sym.name:sym for sym in data_syms} data_names = data_map.keys() + cut_var_map = {sym.list_outputs()[0]:sym for sym in cut_syms} + cut_var_names = cut_var_map.keys() ordered_ins = [] in_state_locs = [] @@ -264,12 +266,11 @@ def check_data(inputs, in_type, msg): ordered_ins.append(data_map[in_name]) in_data_locs.append(len(ordered_ins) - 1) else: + assert in_name in cut_var_names # The remaining inputs are the ones cut from the original graph. 
- # The names of these variable nodes contain the index in cut_syms. - m = re.search(r'\d+$', in_name) - idx = int(m.group()) if m else None - assert idx < len(cut_syms) - ordered_ins.append(cut_syms[idx]) + # The names of new created variable nodes should match the ones + # of the original nodes. + ordered_ins.append(cut_var_map[in_name]) num_outputs = len(flat_out) num_states = len(state_names) diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index b825a9670700..dd37746e5b23 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -348,10 +348,8 @@ int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, namespace mxnet { extern std::vector GetInputSymbols(const nnvm::Symbol &sym); -extern bool CutGraph(const std::vector &input_entries, - const std::string &in_name_prefix, bool skip_var, - std::vector *orig_entries, - std::vector *new_var_names); +extern bool CutGraphInputs(const std::vector &input_entries, + bool skip_var, std::vector *orig_entries); } @@ -396,9 +394,7 @@ int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle **input_symbols, }); std::vector orig_entries; - std::vector new_var_names; - CutGraph(input_entries, subg_name + "_var", false, &orig_entries, &new_var_names); - + CutGraphInputs(input_entries, false, &orig_entries); std::vector input_syms(orig_entries.size()); for (size_t i = 0; i < input_syms.size(); i++) { input_syms[i] = new nnvm::Symbol(); diff --git a/src/nnvm/graph_editor.cc b/src/nnvm/graph_editor.cc index 98c99e2425df..499780869c35 100644 --- a/src/nnvm/graph_editor.cc +++ b/src/nnvm/graph_editor.cc @@ -63,10 +63,8 @@ std::vector GetInputSymbols(const nnvm::Symbol &sym) { * subgraph. It returns the nodes that connect to the subgraph directly and * the names of the new variable nodes. */ -bool CutGraph(const std::vector &input_entries, - const std::string &in_name_prefix, bool skip_var, - std::vector *orig_entries, - std::vector *new_var_names) { +bool CutGraphInputs(const std::vector &input_entries, + bool skip_var, std::vector *orig_entries) { orig_entries->reserve(input_entries.size()); for (size_t i = 0; i < input_entries.size(); i++) { nnvm::NodeEntry *e = input_entries[i]; @@ -75,8 +73,9 @@ bool CutGraph(const std::vector &input_entries, continue; orig_entries->push_back(*e); - new_var_names->push_back(in_name_prefix + std::to_string(i)); - nnvm::NodePtr n = nnvm::CreateVariableNode(new_var_names->back()); + nnvm::Symbol sym; + sym.outputs.push_back(*e); + nnvm::NodePtr n = nnvm::CreateVariableNode(sym.ListOutputNames()[0]); *e = nnvm::NodeEntry{n, 0, 0}; } return true; From 2a6925708cfbfb0f5bb6ffd7e8cc7e6682f18c02 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 8 Jun 2018 00:59:48 +0000 Subject: [PATCH 077/135] fix a bug when a subgraph has variable nodes. --- python/mxnet/symbol/contrib.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 8d591224b31f..3c437f283f7b 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -265,12 +265,15 @@ def check_data(inputs, in_type, msg): elif in_name in data_names: ordered_ins.append(data_map[in_name]) in_data_locs.append(len(ordered_ins) - 1) - else: - assert in_name in cut_var_names - # The remaining inputs are the ones cut from the original graph. - # The names of new created variable nodes should match the ones - # of the original nodes. 
+ elif in_name in cut_var_names: ordered_ins.append(cut_var_map[in_name]) + else: + # The remaining inputs are the ones cut from the original graph + # or the ones created inside the user-defined function. The names + # of new created variable nodes should match the ones of + # the original nodes. + assert in_name in gin_names + ordered_ins.append(input_syms[in_name]) num_outputs = len(flat_out) num_states = len(state_names) From efeedd6b4940056d26af52d6dce2900dc75da559 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 8 Jun 2018 01:01:01 +0000 Subject: [PATCH 078/135] Fix a bug of getting symbols. --- include/mxnet/c_api.h | 2 +- python/mxnet/symbol/contrib.py | 2 +- src/c_api/c_api_symbolic.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 15d213273ad4..528a76bbf7d0 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1070,7 +1070,7 @@ MXNET_DLL int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle *inputs, * \param inputs The nodes that connect to the subgraph. * \param input_size The number of such nodes. */ -MXNET_DLL int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle **inputs, +MXNET_DLL int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle *inputs, int *input_size); /*! diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 3c437f283f7b..3d26c8197764 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -118,7 +118,7 @@ def _cut_subgraph(subg): syms = [] for i in range(num_handles.value): - s = Symbol(handles[i]) + s = Symbol(SymbolHandle(handles[i])) syms.append(s) return syms diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index dd37746e5b23..98ce79959798 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -364,7 +364,7 @@ int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle *input_arr, int *inpu API_END_HANDLE_ERROR(); } -int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle **input_symbols, +int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle *input_symbols, int *input_size) { // Given a graph, we want to fetch the nodes that have been marked as part of // a subgraph. From 28fe469d16129f672e88b0a1a4bc757119c8c241 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 8 Jun 2018 19:16:02 +0000 Subject: [PATCH 079/135] copy var nodes. --- python/mxnet/symbol/contrib.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 3d26c8197764..436c2543f881 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -21,6 +21,7 @@ import math import ctypes import re +import copy from .random import uniform from .symbol import Symbol @@ -268,12 +269,11 @@ def check_data(inputs, in_type, msg): elif in_name in cut_var_names: ordered_ins.append(cut_var_map[in_name]) else: - # The remaining inputs are the ones cut from the original graph - # or the ones created inside the user-defined function. The names - # of new created variable nodes should match the ones of - # the original nodes. + # The remaining inputs are the variable nodes created inside the UDF. + # The subgraph can't have nodes shared with the main graph. As such, + # we need to make a copy of these variable nodes. 
assert in_name in gin_names - ordered_ins.append(input_syms[in_name]) + ordered_ins.append(copy.deepcopy(input_syms[in_name])) num_outputs = len(flat_out) num_states = len(state_names) From cf91c59526a079f84ffe033d0d2085aa33a1e613 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Jun 2018 22:02:32 +0000 Subject: [PATCH 080/135] Fix getting op states. --- src/imperative/cached_op.cc | 3 ++- src/imperative/cached_op.h | 2 +- src/operator/subgraph_op_common.cc | 25 ++++--------------------- src/operator/subgraph_op_common.h | 4 ++-- 4 files changed, 9 insertions(+), 25 deletions(-) diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index 2181c5cab871..5e48c5a26f7b 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -818,7 +818,7 @@ OpStatePtr CachedOp::DynamicForward( return op_state; } -void CachedOp::Forward( +OpStatePtr CachedOp::Forward( const std::shared_ptr& op_ptr, const std::vector& inputs, const std::vector& outputs) { @@ -858,6 +858,7 @@ void CachedOp::Forward( std::move(attrs), inputs, outputs, op_state, &save_inputs(), &save_outputs()); } + return op_state; } diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h index 370ef02b5f25..4f4dfdcc14dd 100644 --- a/src/imperative/cached_op.h +++ b/src/imperative/cached_op.h @@ -92,7 +92,7 @@ class CachedOp { std::vector Gradient( const nnvm::NodePtr& node, const std::vector& ograds) const; - void Forward( + OpStatePtr Forward( const std::shared_ptr& op_ptr, const std::vector& inputs, const std::vector& outputs); diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index 815809c995cb..72dda17a3684 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -171,22 +171,6 @@ void LoopState::Forward(std::vector cinputs, for (size_t i = 0; i < outputs.size(); i++) outputs[i] = &coutputs[i]; - if (is_recording) { - all_inputs.push_back(cinputs); - std::vector gradients(cinputs.size()); - std::vector input_ptrs(cinputs.size()); - std::vector gradient_ptrs(cinputs.size()); - std::vector grad_reqs(cinputs.size()); - for (size_t i = 0; i < gradients.size(); i++) { - gradients[i] = NDArray(cinputs[i].shape(), cinputs[i].ctx(), - true, cinputs[i].dtype()); - input_ptrs[i] = &cinputs[i]; - gradient_ptrs[i] = &gradients[i]; - grad_reqs[i] = kWriteTo; - } - Imperative::Get()->MarkVariables(input_ptrs, grad_reqs, gradient_ptrs);; - } - std::vector > kwargs; kwargs.push_back(std::pair("inline_limit", "0")); // Get input names. @@ -197,11 +181,13 @@ void LoopState::Forward(std::vector cinputs, // called. Currently, CachedOp allocates memory each time Forward is called. // I need to fix this once the PR for static memory allocation in CachedOp is // merged. 
https://github.com/apache/incubator-mxnet/pull/10817 - op->Forward(nullptr, inputs, outputs); + OpStatePtr state = op->Forward(nullptr, inputs, outputs); if (is_recording) { + all_inputs.push_back(cinputs); all_outputs.push_back(coutputs); iter_ops.push_back(op); + all_states.push_back(state); } Imperative::Get()->set_is_recording(orig_is_record); @@ -240,10 +226,7 @@ void LoopState::Backward(int iter_no, for (size_t i = 0; i < igrads.size(); i++) outputs.push_back(&igrads[i]); CHECK_EQ(outputs.size(), op->num_inputs()); - - CHECK(!Imperative::AGInfo::IsNone(all_outputs[iter_no][0])); - const nnvm::NodeEntry &node_entry = all_outputs[iter_no][0].entry(); - OpStatePtr state = Imperative::AGInfo::Get(node_entry.node).state; + auto state = all_states[iter_no]; op->Backward(false, state, inputs, req, outputs); } diff --git a/src/operator/subgraph_op_common.h b/src/operator/subgraph_op_common.h index f65eb8ca489c..c3832d53ebf5 100644 --- a/src/operator/subgraph_op_common.h +++ b/src/operator/subgraph_op_common.h @@ -66,8 +66,8 @@ class LoopState { // They also contain the Op state for each CachedOp. std::vector > all_outputs; std::vector > all_inputs; - std::vector > all_gradients; std::vector iter_ops; + std::vector all_states; Symbol subgraph_sym; nnvm::Graph subgraph; @@ -88,7 +88,7 @@ class LoopState { void Cleanup() { all_outputs.clear(); all_inputs.clear(); - all_gradients.clear(); + all_states.clear(); iter_ops.clear(); } }; From f2edf2a412e6e8a854fb8e02260b3bfe8afad512 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Jun 2018 22:15:41 +0000 Subject: [PATCH 081/135] fix lint error. --- src/operator/subgraph_op_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/subgraph_op_common.h b/src/operator/subgraph_op_common.h index c3832d53ebf5..894bc799c12c 100644 --- a/src/operator/subgraph_op_common.h +++ b/src/operator/subgraph_op_common.h @@ -72,7 +72,7 @@ class LoopState { nnvm::Graph subgraph; public: - LoopState(const Symbol &g) { + explicit LoopState(const Symbol &g) { this->subgraph_sym = g; this->subgraph.outputs = g.outputs; } From a35899fe490219fcc9e0f9ce9ecd1585c2a0094d Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Jun 2018 22:21:51 +0000 Subject: [PATCH 082/135] address comments. 
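Review cleanup across the foreach front end and `LoopState`: the docstrings in symbol/contrib.py now talk about symbols rather than NDArrays, the unused `dim` field is dropped from `ForeachParam` (and `iter_dim` becomes const), the unused `not_state_list` local is removed from ndarray/contrib.py, and `LoopState::Forward` loses a leftover `indexed_graph()` lookup. As a reference for the corrected docstrings, the symbolic counterpart of the imperative example earlier in the series; a sketch only, assuming the same contract the unit tests exercise (both `data` and `init_states` must be Symbols here):

    import mxnet as mx

    def step(x, states):
        out = x * 2 + states[0]
        return out, [out]

    data = mx.sym.var('data')   # bound later, iterated over axis 0
    init = mx.sym.var('init')
    outs, states = mx.sym.contrib.foreach(step, data, [init])
    # outs and states are Symbols; concrete shapes appear only after infer_shape/bind.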
--- python/mxnet/ndarray/contrib.py | 1 - python/mxnet/symbol/contrib.py | 4 ++-- src/operator/control_flow.cc | 5 +---- src/operator/subgraph_op_common.cc | 2 -- 4 files changed, 3 insertions(+), 9 deletions(-) diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index b5839cd457be..c82816e1d5af 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -171,7 +171,6 @@ def check_input(inputs, in_type, msg): "init_states should be an NDArray or a list of NDArrays") not_data_list = isinstance(data, ndarray.NDArray) - not_state_list = isinstance(init_states, ndarray.NDArray) num_iters = data.shape[0] if not_data_list else data[0].shape[0] states = init_states outputs = [] diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 436c2543f881..5e76587fd2cf 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -191,9 +191,9 @@ def check_data(inputs, in_type, msg): is_NDArray_or_list = isinstance(inputs, in_type) assert is_NDArray_or_list, msg - check_data(data, symbol.Symbol, "data should be an NDArray or a list of NDArrays") + check_data(data, symbol.Symbol, "data should be a symbol or a list of symbols") check_data(init_states, symbol.Symbol, - "init_states should be an NDArray or a list of NDArrays") + "init_states should be a symbol or a list of symbols") not_state_list = isinstance(init_states, symbol.Symbol) # TODO(zhengda) If the input python function references to the symbols outside diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index c42aca0944d9..e5c1a943906b 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -34,7 +34,6 @@ namespace op { struct ForeachParam : public dmlc::Parameter { int num_args; - int dim; int num_outputs; int num_out_data; nnvm::Tuple in_state_locs; @@ -42,8 +41,6 @@ struct ForeachParam : public dmlc::Parameter { DMLC_DECLARE_PARAMETER(ForeachParam) { DMLC_DECLARE_FIELD(num_args).set_lower_bound(1) .describe("Number of inputs."); - DMLC_DECLARE_FIELD(dim).set_default(1) - .describe("the dimension of the input array to iterate."); DMLC_DECLARE_FIELD(num_outputs) .describe("The number of outputs of the subgraph."); DMLC_DECLARE_FIELD(num_out_data) @@ -73,7 +70,7 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, const std::vector& outputs) { ForeachState &state = state_ptr.get_state(); const ForeachParam& params = state.params; - size_t iter_dim = 0; + const size_t iter_dim = 0; CHECK_EQ(outputs.size(), (size_t) params.num_outputs); CHECK_GT(params.in_data_locs.ndim(), 0); size_t loc0 = params.in_data_locs[0]; diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index 72dda17a3684..579aea915a1e 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -173,8 +173,6 @@ void LoopState::Forward(std::vector cinputs, std::vector > kwargs; kwargs.push_back(std::pair("inline_limit", "0")); - // Get input names. - const auto& idx = subgraph.indexed_graph(); // We don't have parameters for the cached op. CachedOpPtr op = std::make_shared(subgraph_sym, kwargs); // TODO(zhengda) we need to avoid shape inference and memory plan whenever the op is From 8c6aca0fb0f93e0f9a9c560dfd6e5b464969a87a Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Jun 2018 22:22:59 +0000 Subject: [PATCH 083/135] fix lint error. 
--- src/nnvm/graph_editor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnvm/graph_editor.cc b/src/nnvm/graph_editor.cc index 499780869c35..c23fc2569697 100644 --- a/src/nnvm/graph_editor.cc +++ b/src/nnvm/graph_editor.cc @@ -81,4 +81,4 @@ bool CutGraphInputs(const std::vector &input_entries, return true; } -} +} // namespace mxnet From ccaf388a4f17c05a75b08d405b615eefcf871c1d Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Jun 2018 22:38:00 +0000 Subject: [PATCH 084/135] simplify the execution of subgraph in the main thread. --- include/mxnet/op_attr_types.h | 7 +++++- src/executor/attach_op_execs_pass.cc | 35 +++++++--------------------- src/executor/exec_pass.h | 1 - src/executor/graph_executor.cc | 23 ++++-------------- src/operator/control_flow.cc | 6 +++++ 5 files changed, 24 insertions(+), 48 deletions(-) diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index b78cbf518feb..2bb2462d4869 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -100,7 +100,12 @@ enum class ExecType { * In current implementation, copy operator is specially handled by executor. * This flag is used for special case treatment and future extension of different copy ops. */ - kCrossDeviceCopy + kCrossDeviceCopy, + /*! + * \brief A subgraph execution should happen in the main thread, instead of + * in the execution engine. + */ + kSubgraphExec, }; /*! \brief the dispatch mode of the operator */ diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 4db1b7651fa0..72919d90c620 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -126,10 +126,6 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { PostFCompute(is_gpu); } - bool HasSubgraph() const override { - return !attrs_.subgraphs.empty(); - } - ExecType exec_type() const override { return exec_type_; } @@ -142,15 +138,14 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { return state_; } - explicit StatefulComputeExecutor(const NodeAttrs& attrs, const OpStatePtr& state, + explicit StatefulComputeExecutor(const OpStatePtr& state, const FStatefulCompute& fcompute, ExecType exec_type, const std::vector &mutate_idx) - : StorageFallbackOpExecutor(mutate_idx), attrs_(attrs), + : StorageFallbackOpExecutor(mutate_idx), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} private: - NodeAttrs attrs_; OpStatePtr state_; FStatefulCompute fcompute_; ExecType exec_type_; @@ -168,10 +163,6 @@ class StatefulComputeExExecutor : public OpExecutor { fcompute_(state_, op_ctx, in_array, req, out_array); } - bool HasSubgraph() const override { - return !attrs_.subgraphs.empty(); - } - void Setup() override {} ExecType exec_type() const override { @@ -186,13 +177,12 @@ class StatefulComputeExExecutor : public OpExecutor { return state_; } - explicit StatefulComputeExExecutor(const NodeAttrs& attrs, const OpStatePtr& state, + explicit StatefulComputeExExecutor(const OpStatePtr& state, const FStatefulComputeEx& fcompute, ExecType exec_type) - : attrs_(attrs), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} + : state_(state), fcompute_(fcompute), exec_type_(exec_type) {} private: - NodeAttrs attrs_; OpStatePtr state_; FStatefulComputeEx fcompute_; ExecType exec_type_; @@ -217,10 +207,6 @@ class FComputeExecutor : public StorageFallbackOpExecutor { return exec_type_; } - bool HasSubgraph() const override { - return !attrs_.subgraphs.empty(); - } - explicit 
FComputeExecutor(const NodeAttrs& attrs, FCompute fcompute, ExecType exec_type, const std::vector &mutate_idx) : StorageFallbackOpExecutor(mutate_idx), @@ -246,10 +232,6 @@ class FComputeExExecutor : public OpExecutor { void Setup() override {} - bool HasSubgraph() const override { - return !attrs_.subgraphs.empty(); - } - ExecType exec_type() const override { return exec_type_; } @@ -311,15 +293,14 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i) { op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared(inode.source->attrs, state, - fcompute_ex, exec_type); + ret[i] = std::make_shared(state, fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute( op, "FStatefulCompute", vctx[i]); CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared(inode.source->attrs, state, fcompute, + ret[i] = std::make_shared(state, fcompute, exec_type, mutate_index); } } else if (is_layer_backward.get(op, false)) { @@ -331,7 +312,7 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i) { op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared(inode.source->attrs, + ret[i] = std::make_shared( ret[fwd_id].get()->state(), fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute( @@ -339,7 +320,7 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i) { CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared(inode.source->attrs, + ret[i] = std::make_shared( ret[fwd_id].get()->state(), fcompute, exec_type, mutate_index); } } else { diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index 3cc7e73c39d0..26a249118940 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -64,7 +64,6 @@ class OpExecutor { OpContext op_ctx; /*! \brief virtual destructor */ virtual ~OpExecutor() {} - virtual bool HasSubgraph() const = 0; /*! * \brief Setup the executor for given NDArray member * this can be called multiple times if NDArray changed during reshape. diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 5616000f2743..7386de4d12e3 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1490,11 +1490,7 @@ void GraphExecutor::BulkTrainingOpSegs(size_t total_num_nodes) { // check if the segment relies on external input, or exceeds maxinum number of node, // or requires async ops if (node->is_variable() || nid - topo_start > num_nodes_threshold || - op_node.exec->exec_type() != ExecType::kSync || - // If the node has a subgraph, we shouldn't add it to the segment. - // We'll execute the node separately from other nodes. - // CreateCachedSegOpr creates a segment excluding nodes with subgraphs. 
- op_node.exec->HasSubgraph()) { + op_node.exec->exec_type() != ExecType::kSync) { // create a new segment for the previous nodes if the current one cannot be bulked cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, nid); topo_start = nid + 1; @@ -1519,11 +1515,7 @@ void GraphExecutor::BulkTrainingOpSegs(size_t total_num_nodes) { continue; } if (idx[nid].source->is_variable() || nid - topo_start > num_nodes_threshold || - op_node.exec->exec_type() != ExecType::kSync || - // If the node has a subgraph, we shouldn't add it to the segment. - // We'll execute the node separately from other nodes. - // CreateCachedSegOpr creates a segment excluding nodes with subgraphs. - op_node.exec->HasSubgraph()) { + op_node.exec->exec_type() != ExecType::kSync) { cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, nid); topo_start = nid + 1; } else { @@ -1557,11 +1549,7 @@ void GraphExecutor::BulkInferenceOpSegs() { // Variables do not need to be segmented at inference time. if (node->is_variable()) continue; - if (op_node.exec->exec_type() != ExecType::kSync || - // If the node has a subgraph, we shouldn't add it to the segment. - // We'll execute the node separately from other nodes. - // CreateCachedSegOpr creates a segment excluding nodes with subgraphs. - op_node.exec->HasSubgraph()) { + if (op_node.exec->exec_type() != ExecType::kSync) { cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, nid); topo_start = nid + 1; } @@ -1629,7 +1617,7 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { CHECK_EQ(opnode.exec->in_array.size(), 1U); CHECK_EQ(opnode.exec->out_array.size(), 1U); CopyFromTo(opnode.exec->in_array[0], &(opnode.exec->out_array[0])); - } else if (opnode.exec->HasSubgraph()) { + } else if (opnode.exec->exec_type() == ExecType::kSubgraphExec) { // If the node contains a subgraph, we can't execute it in the engine. opnode.exec->Run(opnode.exec->op_ctx.run_ctx, false); } else if (opnode.cached_opr != nullptr) { @@ -1666,9 +1654,6 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start, OpNode& op_node = op_nodes_[nid]; if (op_node.skip_exec_node) continue; if (inode.source->is_variable()) continue; - // We shouldn't add control flow operators to a segment. - // We can't execute these operators in the engine. - if (op_node.exec->HasSubgraph()) return ret; if (op_node.exec->exec_type() != ExecType::kSync) { return ret; } diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index e5c1a943906b..7e26da94c42e 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -390,6 +390,9 @@ NNVM_REGISTER_OP(_foreach) // Foreach operator works like an executor. Its code will always run on CPU. // So the same code can be registered for both CPU and GPU. 
.set_attr("FStatefulComputeEx", ForeachComputeExCPU) +.set_attr("FExecType", [](const NodeAttrs& attrs) { + return ExecType::kSubgraphExec; +}) .set_attr("key_var_num_args", "num_args") .add_argument("fn", "Symbol", "Input graph.") .add_argument("data", "NDArray-or-Symbol[]", @@ -405,6 +408,9 @@ NNVM_REGISTER_OP(_backward_foreach) const ForeachParam& params = nnvm::get(attrs.parsed); return params.num_args - 1; }) +.set_attr("FExecType", [](const NodeAttrs& attrs) { + return ExecType::kSubgraphExec; +}) .set_attr("FInferStorageType", BackwardForeachStorageType) .set_attr_parser(ParamParser) .set_attr("TIsLayerOpBackward", true) From 25cf8ac0d8938bd1db6643a9de7615599e80d3e8 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Jun 2018 22:47:48 +0000 Subject: [PATCH 085/135] fix lint error. --- python/mxnet/ndarray/contrib.py | 8 +++++--- python/mxnet/symbol/contrib.py | 8 +++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index c82816e1d5af..3f8f20a73919 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -168,7 +168,7 @@ def check_input(inputs, in_type, msg): check_input(data, ndarray.NDArray, "data should be an NDArray or a list of NDArrays") check_input(init_states, ndarray.NDArray, - "init_states should be an NDArray or a list of NDArrays") + "init_states should be an NDArray or a list of NDArrays") not_data_list = isinstance(data, ndarray.NDArray) num_iters = data.shape[0] if not_data_list else data[0].shape[0] @@ -183,8 +183,10 @@ def check_input(inputs, in_type, msg): outs = _as_list(outs) outputs.append(outs) outputs = zip(*outputs) - for j, out in enumerate(outputs): - outputs[j] = ndarray.op.stack(*out) + tmp_outputs = [] + for out in outputs: + tmp_outputs.append(ndarray.op.stack(*out)) + outputs = tmp_outputs if not_data_list: outputs = outputs[0] diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 5e76587fd2cf..469b937cc151 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -20,7 +20,6 @@ """Contrib Symbol API of MXNet.""" import math import ctypes -import re import copy from .random import uniform @@ -192,8 +191,7 @@ def check_data(inputs, in_type, msg): assert is_NDArray_or_list, msg check_data(data, symbol.Symbol, "data should be a symbol or a list of symbols") - check_data(init_states, symbol.Symbol, - "init_states should be a symbol or a list of symbols") + check_data(init_states, symbol.Symbol, "init_states should be a symbol or a list of symbols") not_state_list = isinstance(init_states, symbol.Symbol) # TODO(zhengda) If the input python function references to the symbols outside @@ -212,9 +210,9 @@ def check_data(inputs, in_type, msg): sym_out, sym_states = body(in_eles, states) check_data(sym_out, symbol.Symbol, - "the output should be an NDArray or a list of NDArrays") + "the output should be an NDArray or a list of NDArrays") check_data(sym_states, symbol.Symbol, - "the output states should be an NDArray or a list of NDArrays") + "the output states should be an NDArray or a list of NDArrays") if isinstance(sym_states, list): assert isinstance(init_states, list) and len(sym_states) == len(init_states), \ "the number of output states (%d) should be the same as input states (%d)" \ From 51de14c206b5de96eb3306633a6a8e0e814870f2 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Jun 2018 23:01:39 +0000 Subject: [PATCH 086/135] avoid waiting for computation in each iteration. 
--- src/operator/control_flow.cc | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 7e26da94c42e..9e75c22e9cb6 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -157,12 +157,6 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, } state.Forward(subg_inputs, req, *subg_out_curr, ctx.need_grad); - // We need to wait for the iteration to complete before executing - // the next one or return from the loop. In this way, we can reuse - // the memory in the subgraph. - for (size_t j = 0; j < subg_out_curr->size(); j++) { - (*subg_out_curr)[j].WaitToRead(); - } } } @@ -234,13 +228,6 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, state.Backward(iter_num, ograds, iter_req, igrads); - // We need to wait for the iteration to complete before executing - // the next one or return from the loop. In this way, we can reuse - // the memory in the subgraph. - for (size_t i = 0; i < igrads.size(); i++) { - igrads[i].WaitToRead(); - } - size_t num_states = ograds.size() - num_output_data; for (size_t i = 0; i < num_states; i++) { size_t loc = params.in_state_locs[i]; From 3eb0bc1109001f822df0afc10ce773c4c901ce83 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 14 Jun 2018 00:56:16 +0000 Subject: [PATCH 087/135] reuse cached op for inference. --- src/operator/subgraph_op_common.cc | 17 +++++++++++++++-- src/operator/subgraph_op_common.h | 5 +++++ tests/python/unittest/test_operator.py | 5 ++++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index 579aea915a1e..ab301bf3d917 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -173,8 +173,21 @@ void LoopState::Forward(std::vector cinputs, std::vector > kwargs; kwargs.push_back(std::pair("inline_limit", "0")); - // We don't have parameters for the cached op. - CachedOpPtr op = std::make_shared(subgraph_sym, kwargs); + kwargs.push_back(std::pair("static_alloc", "1")); + CachedOpPtr op; + // If we need to run backward, we need to keep all computation results + // for backward. + if (is_recording) { + op = std::make_shared(subgraph_sym, kwargs); + } else if (iter_ops.empty()) { + // If we don't need to run backward and this is the first time of + // running the iteration, we need to create a new cached op. + op = std::make_shared(subgraph_sym, kwargs); + iter_ops.push_back(op); + } else { + // If we already have a cached op, we can just reuse it. + op = iter_ops[0]; + } // TODO(zhengda) we need to avoid shape inference and memory plan whenever the op is // called. Currently, CachedOp allocates memory each time Forward is called. // I need to fix this once the PR for static memory allocation in CachedOp is diff --git a/src/operator/subgraph_op_common.h b/src/operator/subgraph_op_common.h index 894bc799c12c..60b97b4a979f 100644 --- a/src/operator/subgraph_op_common.h +++ b/src/operator/subgraph_op_common.h @@ -66,6 +66,11 @@ class LoopState { // They also contain the Op state for each CachedOp. std::vector > all_outputs; std::vector > all_inputs; + // For inference, there should be only one cached op because we + // want to share the memory in iterations. + // For training, each iteration has a cached op because each iteration + // needs to maintain a set of memory buffers for all computation states, + // which will be used in the backward. 
std::vector iter_ops; std::vector all_states; Symbol subgraph_sym; diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 0ac9d458473d..4a2fb24302c5 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5998,7 +5998,10 @@ def verify_foreach(step, in_syms, state_syms, free_syms, name = name[1:] gin_order.append(int(name)) - e = out.bind(ctx=default_context(), args=arg_dict, args_grad=arg_grad_dict) + if is_train: + e = out.bind(ctx=default_context(), args=arg_dict, args_grad=arg_grad_dict) + else: + e = out.bind(ctx=default_context(), args=arg_dict) e.forward(is_train=is_train) if (is_train): # backward From 4a0ff21ab6dd255148b3cc4541c8936eac342a05 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 14 Jun 2018 01:11:43 +0000 Subject: [PATCH 088/135] share memory across mini-batches. --- src/operator/control_flow.cc | 1 - tests/python/unittest/test_operator.py | 21 ++++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 9e75c22e9cb6..e24629520a92 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -235,7 +235,6 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, ograds[i + num_output_data] = igrads[loc]; } } - state.Cleanup(); } static bool ForeachShape(const nnvm::NodeAttrs& attrs, diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 4a2fb24302c5..39b270eaf1fd 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5958,7 +5958,7 @@ def step3(in1, states, free): def verify_foreach(step, in_syms, state_syms, free_syms, in_arrs, init_states, frees, out_grads, is_train=True, - free_vars_func=None): + free_vars_func=None, num_iters=1): step_sym = lambda in_syms, state_syms : step(in_syms, state_syms, free_syms) res, states = mx.sym.contrib.foreach(step_sym, in_syms, state_syms) out = _as_list(res) @@ -6002,12 +6002,15 @@ def verify_foreach(step, in_syms, state_syms, free_syms, e = out.bind(ctx=default_context(), args=arg_dict, args_grad=arg_grad_dict) else: e = out.bind(ctx=default_context(), args=arg_dict) - e.forward(is_train=is_train) - if (is_train): - # backward - tmp_grads = out_grads[0][:] - tmp_grads.extend(out_grads[1]) - e.backward(tmp_grads) + # the inputs to forward and backward are the same so forward and backward + # should always return the same outputs. + for i in range(num_iters): + e.forward(is_train=is_train) + if (is_train): + # backward + tmp_grads = out_grads[0][:] + tmp_grads.extend(out_grads[1]) + e.backward(tmp_grads) # Below we use imperative to reimplement foreach and compute its gradients. res = [] @@ -6079,6 +6082,10 @@ def verify_foreach(step, in_syms, state_syms, free_syms, lambda frees : [frees[0] + frees[1]]) verify_foreach(step1, v3, [v4], [v5 + v6], arrs, states, frees1, out_grads, False, lambda frees : [frees[0] + frees[1]]) + verify_foreach(step1, v3, [v4], [v5 + v6], arrs, states, frees1, out_grads, True, + lambda frees : [frees[0] + frees[1]], 5) + verify_foreach(step1, v3, [v4], [v5 + v6], arrs, states, frees1, out_grads, False, + lambda frees : [frees[0] + frees[1]], 5) frees = [mx.nd.random.uniform(shape=(2))] arrs = mx.nd.random.uniform(shape=(2, 2)) From 8766cb2b02e3c40bbac604f08f46a22e68405554 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 14 Jun 2018 21:42:01 +0000 Subject: [PATCH 089/135] reuse memory. 
reuse memory between iterations in inference. reuse memory between mini-batches in training. --- src/operator/control_flow.cc | 3 ++- src/operator/subgraph_op_common.cc | 23 ++++++++++++++--------- src/operator/subgraph_op_common.h | 4 ++-- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index e24629520a92..03c3c0dbe8a9 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -156,7 +156,7 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, } } - state.Forward(subg_inputs, req, *subg_out_curr, ctx.need_grad); + state.Forward(i, subg_inputs, req, *subg_out_curr, ctx.need_grad); } } @@ -235,6 +235,7 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, ograds[i + num_output_data] = igrads[loc]; } } + state.Cleanup(); } static bool ForeachShape(const nnvm::NodeAttrs& attrs, diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index ab301bf3d917..fb1bc365e890 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -151,7 +151,8 @@ bool InferSubgraphBackwardStorage(const nnvm::Symbol &subgraph, return true; } -void LoopState::Forward(std::vector cinputs, +void LoopState::Forward(int iter_no, + std::vector cinputs, const std::vector& req, std::vector coutputs, bool is_recording) { @@ -175,18 +176,23 @@ void LoopState::Forward(std::vector cinputs, kwargs.push_back(std::pair("inline_limit", "0")); kwargs.push_back(std::pair("static_alloc", "1")); CachedOpPtr op; - // If we need to run backward, we need to keep all computation results - // for backward. - if (is_recording) { + if (is_recording && iter_ops.size() > (size_t) iter_no) + op = iter_ops[iter_no]; + else if (!is_recording && iter_ops.size() == 1) + op = iter_ops[0]; + + // If we need to run backward and we don't have a cached op for this iteration, + // we create one for this iteration. + if (is_recording && op == nullptr) { op = std::make_shared(subgraph_sym, kwargs); - } else if (iter_ops.empty()) { + CHECK_EQ(iter_ops.size(), iter_no); + iter_ops.push_back(op); + } else if (op == nullptr) { // If we don't need to run backward and this is the first time of // running the iteration, we need to create a new cached op. op = std::make_shared(subgraph_sym, kwargs); + CHECK(iter_ops.empty()); iter_ops.push_back(op); - } else { - // If we already have a cached op, we can just reuse it. - op = iter_ops[0]; } // TODO(zhengda) we need to avoid shape inference and memory plan whenever the op is // called. Currently, CachedOp allocates memory each time Forward is called. 
@@ -197,7 +203,6 @@ void LoopState::Forward(std::vector cinputs, if (is_recording) { all_inputs.push_back(cinputs); all_outputs.push_back(coutputs); - iter_ops.push_back(op); all_states.push_back(state); } diff --git a/src/operator/subgraph_op_common.h b/src/operator/subgraph_op_common.h index 60b97b4a979f..4243b6b0f9dc 100644 --- a/src/operator/subgraph_op_common.h +++ b/src/operator/subgraph_op_common.h @@ -82,7 +82,8 @@ class LoopState { this->subgraph.outputs = g.outputs; } - void Forward(std::vector cinputs, + void Forward(int iter_no, + std::vector cinputs, const std::vector& req, std::vector coutputs, bool is_recording); @@ -94,7 +95,6 @@ class LoopState { all_outputs.clear(); all_inputs.clear(); all_states.clear(); - iter_ops.clear(); } }; From 97b90740dd89f85a5b7eed6576db3c605967560e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 14 Jun 2018 22:37:55 +0000 Subject: [PATCH 090/135] add tests for multiple batches. --- tests/python/unittest/test_operator.py | 97 ++++++++++++++++---------- 1 file changed, 59 insertions(+), 38 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 39b270eaf1fd..eefe7efbffd7 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -6202,6 +6202,7 @@ def sym_group(out): ret.extend(out[1]) return mx.sym.Group(ret) + # Inputs data_arr = mx.nd.random.uniform(shape=(2, 2, 4)) h_arr = mx.nd.random.uniform(shape=(2, 4)) c_arr = mx.nd.random.uniform(shape=(2, 4)) @@ -6209,7 +6210,14 @@ def sym_group(out): h2h_warr = mx.nd.random.uniform(shape=(16, 4)) i2h_barr = mx.nd.random.uniform(shape=(16)) h2h_barr = mx.nd.random.uniform(shape=(16)) - + args1 = {'data': data_arr, 'h': h_arr, 'c': c_arr, + 'i2h_weight': i2h_warr, 'h2h_weight': h2h_warr, + 'i2h_bias': i2h_barr, 'h2h_bias': h2h_barr} + args2 = {'data': data_arr, 'h': h_arr, 'c': c_arr, + 'mylstm_i2h_weight': i2h_warr, 'mylstm_h2h_weight': h2h_warr, + 'mylstm_i2h_bias': i2h_barr, 'mylstm_h2h_bias': h2h_barr} + + # gradients for the backward of the foreach symbol data_arr_grad1 = mx.nd.empty(data_arr.shape) h_arr_grad1 = mx.nd.empty(h_arr.shape) c_arr_grad1 = mx.nd.empty(c_arr.shape) @@ -6217,28 +6225,11 @@ def sym_group(out): h2h_warr_grad1 = mx.nd.empty(h2h_warr.shape) i2h_barr_grad1 = mx.nd.empty(i2h_barr.shape) h2h_barr_grad1 = mx.nd.empty(h2h_barr.shape) - out = mx.sym.contrib.foreach(step, data, [init_h, init_c]) - out = sym_group(out) - js_1 = out.tojson() - out = mx.sym.load_json(js_1) - js_2 = out.tojson() - assert js_1 == js_2 - - e1 = out.bind(ctx=default_context(), - args={'data': data_arr, 'h': h_arr, 'c': c_arr, - 'i2h_weight': i2h_warr, 'h2h_weight': h2h_warr, - 'i2h_bias': i2h_barr, 'h2h_bias': h2h_barr}, - args_grad={'data': data_arr_grad1, 'h': h_arr_grad1, 'c': c_arr_grad1, - 'i2h_weight': i2h_warr_grad1, 'h2h_weight': h2h_warr_grad1, - 'i2h_bias': i2h_barr_grad1, 'h2h_bias': h2h_barr_grad1}) - e1.forward(is_train=True) - outputs1 = e1.outputs - # backward - out_grads = [] - for arr in e1.outputs: - out_grads.append(mx.nd.random.uniform(-10, 10, arr.shape)) - e1.backward(out_grads) + args_grad1 = {'data': data_arr_grad1, 'h': h_arr_grad1, 'c': c_arr_grad1, + 'i2h_weight': i2h_warr_grad1, 'h2h_weight': h2h_warr_grad1, + 'i2h_bias': i2h_barr_grad1, 'h2h_bias': h2h_barr_grad1} + # gradients for the backward of the unrolled symbol. 
data_arr_grad2 = mx.nd.empty(data_arr.shape) h_arr_grad2 = mx.nd.empty(h_arr.shape) c_arr_grad2 = mx.nd.empty(c_arr.shape) @@ -6246,6 +6237,20 @@ def sym_group(out): h2h_warr_grad2 = mx.nd.empty(h2h_warr.shape) i2h_barr_grad2 = mx.nd.empty(i2h_barr.shape) h2h_barr_grad2 = mx.nd.empty(h2h_barr.shape) + args_grad2 = {'data': data_arr_grad2, 'h': h_arr_grad2, 'c': c_arr_grad2, + 'mylstm_i2h_weight': i2h_warr_grad2, 'mylstm_h2h_weight': h2h_warr_grad2, + 'mylstm_i2h_bias': i2h_barr_grad2, 'mylstm_h2h_bias': h2h_barr_grad2} + + # Symbol of running LSTM with foreach. + out = mx.sym.contrib.foreach(step, data, [init_h, init_c]) + out = sym_group(out) + js_1 = out.tojson() + out = mx.sym.load_json(js_1) + js_2 = out.tojson() + assert js_1 == js_2 + e1 = out.bind(ctx=default_context(), args=args1, args_grad=args_grad1) + + # Symbol of running unrolled LSTM. lstm = mx.rnn.LSTMCell(4, prefix='mylstm_') h = init_h c = init_c @@ -6259,22 +6264,38 @@ def sym_group(out): out = mx.sym.load_json(js_1) js_2 = out.tojson() assert js_1 == js_2 - - e2 = out.bind(ctx=default_context(), - args={'data': data_arr, 'h': h_arr, 'c': c_arr, - 'mylstm_i2h_weight': i2h_warr, 'mylstm_h2h_weight': h2h_warr, - 'mylstm_i2h_bias': i2h_barr, 'mylstm_h2h_bias': h2h_barr}, - args_grad={'data': data_arr_grad2, 'h': h_arr_grad2, 'c': c_arr_grad2, - 'mylstm_i2h_weight': i2h_warr_grad2, 'mylstm_h2h_weight': h2h_warr_grad2, - 'mylstm_i2h_bias': i2h_barr_grad2, 'mylstm_h2h_bias': h2h_barr_grad2}) - e2.forward(is_train=True) - outputs2 = e2.outputs - e2.backward(out_grads) - - for i in range(len(outputs2)): - assert_almost_equal(outputs1[i].asnumpy(), outputs2[i].asnumpy(), rtol=0.001, atol=0.0001) - for i in range(len(e1.grad_arrays)): - assert_almost_equal(e1.grad_arrays[i].asnumpy(), e2.grad_arrays[i].asnumpy()) + e2 = out.bind(ctx=default_context(), args=args2, args_grad=args_grad2) + + for i in range(5): + out_grads = [] + for arr in e1.outputs: + out_grads.append(mx.nd.random.uniform(-10, 10, arr.shape)) + + data_arr = mx.nd.random.uniform(shape=(2, 2, 4)) + h_arr = mx.nd.random.uniform(shape=(2, 4)) + c_arr = mx.nd.random.uniform(shape=(2, 4)) + i2h_warr = mx.nd.random.uniform(shape=(16, 4)) + h2h_warr = mx.nd.random.uniform(shape=(16, 4)) + i2h_barr = mx.nd.random.uniform(shape=(16)) + h2h_barr = mx.nd.random.uniform(shape=(16)) + + e1.forward(is_train=True, data = data_arr, h = h_arr, c = c_arr, + i2h_weight = i2h_warr, h2h_weight = h2h_warr, + i2h_bias = i2h_barr, h2h_bias = h2h_barr) + outputs1 = e1.outputs + e1.backward(out_grads) + + e2.forward(is_train=True, data = data_arr, h = h_arr, c = c_arr, + mylstm_i2h_weight = i2h_warr, mylstm_h2h_weight = h2h_warr, + mylstm_i2h_bias = i2h_barr, mylstm_h2h_bias = h2h_barr) + outputs2 = e2.outputs + e2.backward(out_grads) + + for i in range(len(outputs2)): + assert_almost_equal(outputs1[i].asnumpy(), outputs2[i].asnumpy(), + rtol=0.001, atol=0.0001) + for i in range(len(e1.grad_arrays)): + assert_almost_equal(e1.grad_arrays[i].asnumpy(), e2.grad_arrays[i].asnumpy()) @with_seed() From e38b7f47f3dd58be5408cfb55a6e702413e36f26 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 15 Jun 2018 17:48:00 +0000 Subject: [PATCH 091/135] remove entry. 
--- include/mxnet/ndarray.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 9e0f0b289d28..d3f44404fd82 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -703,10 +703,6 @@ class NDArray { NDArray MKLDNNDataReshape(const TShape &shape) const; #endif - const nnvm::NodeEntry &entry() const { - return entry_; - } - /*! * \brief Save list of ndarray into the Stream.x * \param fo The stream of output. From 198bcfb11b06891403a74f805afaa8401b360ae5 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 16 Jun 2018 01:21:22 +0000 Subject: [PATCH 092/135] add benchmark for foreach. --- example/rnn/benchmark_rnn.py | 112 +++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 example/rnn/benchmark_rnn.py diff --git a/example/rnn/benchmark_rnn.py b/example/rnn/benchmark_rnn.py new file mode 100644 index 000000000000..124437d4de98 --- /dev/null +++ b/example/rnn/benchmark_rnn.py @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import subprocess +import mxnet as mx +from mxnet import gluon +import time + +def get_gpus(): + """ + return a list of GPUs + """ + try: + re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True) + except OSError: + return [] + return range(len([i for i in re.split('\n') if 'GPU' in i])) + +class TestRNNLayer(gluon.HybridBlock): + def __init__(self, hidden_size, prefix=None, params=None): + super(TestRNNLayer, self).__init__(prefix=prefix, params=params) + self.cell = gluon.rnn.RNNCell(hidden_size, prefix='rnn_') + + def hybrid_forward(self, F, inputs, states): + states = [states] + out, states = F.contrib.foreach(self.cell, inputs, states) + return out + +def test_contrib_rnn(batch_size, input_size, hidden_size, seq_len, ctx): + rnn_data = mx.nd.normal(loc=0, scale=1, shape=(seq_len, batch_size, input_size)) + states = mx.nd.normal(loc=0, scale=1, shape=(batch_size, hidden_size)) + num_batches = 20 + + # Imperative + layer1 = TestRNNLayer(hidden_size) + layer1.initialize(ctx=ctx) + + # Hybridize + layer2 = TestRNNLayer(hidden_size) + layer2.initialize(ctx=ctx) + layer2.hybridize() + + tic = time.time() + for i in range(num_batches): + res1 = layer1(rnn_data, states) + mx.nd.waitall() + print("Imperative inference takes " + str(time.time() - tic)) + + tic = time.time() + for i in range(num_batches): + res2 = layer2(rnn_data, states) + mx.nd.waitall() + print("Hybrid inference takes " + str(time.time() - tic)) + + #trainer = gluon.Trainer(layer1.collect_params(), 'sgd', {'learning_rate' : 0.03}) + tic = time.time() + for i in range(num_batches): + with mx.autograd.record(): + res1 = layer1(rnn_data, states) + res1.backward() + #trainer.step(batch_size) + print("Imperative training takes " + str(time.time() - tic)) + + #trainer = gluon.Trainer(layer2.collect_params(), 'sgd', {'learning_rate' : 0.03}) + tic = time.time() + for i in range(num_batches): + with mx.autograd.record(): + res2 = layer2(rnn_data, states) + res2.backward() + #trainer.step(batch_size) + print("Hybrid training takes " + str(time.time() - tic)) + + layer2.export("foreach_rnn") + symnet = mx.symbol.load('foreach_rnn-symbol.json') + # Inputs + args1 = {} + params = layer2.collect_params() + for key in params.keys(): + args1[key] = params[key].data() + args1['data0'] = rnn_data + args1['data1'] = states + # gradients for the backward of the foreach symbol + args_grad1 = {} + for key in args1.keys(): + args_grad1[key] = mx.nd.empty(args1[key].shape) + exe = symnet.bind(ctx=ctx, args=args1, args_grad=args_grad1) + tic = time.time() + for i in range(num_batches): + exe.forward(is_train=True) + exe.backward(res2) + print("Symbol training takes " + str(time.time() - tic)) + +if __name__ == '__main__': + print("Benchmark in CPU:") + test_contrib_rnn(1, 100, 100, 100, mx.cpu(0)) + if len(get_gpus()) > 0: + print("Benchmark in GPU:") + test_contrib_rnn(1, 100, 100, 100, mx.gpu(0)) From 811acb34d571b8dacf695fdd425aefa5c527d0df Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 16 Jun 2018 22:41:01 +0000 Subject: [PATCH 093/135] benchmark large batch size. 
--- example/rnn/benchmark_rnn.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/example/rnn/benchmark_rnn.py b/example/rnn/benchmark_rnn.py index 124437d4de98..18d6bf1ead5d 100644 --- a/example/rnn/benchmark_rnn.py +++ b/example/rnn/benchmark_rnn.py @@ -105,8 +105,12 @@ def test_contrib_rnn(batch_size, input_size, hidden_size, seq_len, ctx): print("Symbol training takes " + str(time.time() - tic)) if __name__ == '__main__': - print("Benchmark in CPU:") + print("Benchmark in CPU (batch size: 1)") test_contrib_rnn(1, 100, 100, 100, mx.cpu(0)) + print("Benchmark in CPU (batch size: 32)") + test_contrib_rnn(32, 100, 100, 100, mx.cpu(0)) if len(get_gpus()) > 0: - print("Benchmark in GPU:") + print("Benchmark in GPU (batch size: 1)") test_contrib_rnn(1, 100, 100, 100, mx.gpu(0)) + print("Benchmark in GPU (batch size: 32)") + test_contrib_rnn(32, 100, 100, 100, mx.gpu(0)) From 24fa83b98c04fbeca698b4b386d3b052523f5601 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sun, 17 Jun 2018 00:41:49 +0000 Subject: [PATCH 094/135] Fix the benchmark for GPU. --- example/rnn/benchmark_rnn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/example/rnn/benchmark_rnn.py b/example/rnn/benchmark_rnn.py index 18d6bf1ead5d..285239344a13 100644 --- a/example/rnn/benchmark_rnn.py +++ b/example/rnn/benchmark_rnn.py @@ -41,8 +41,8 @@ def hybrid_forward(self, F, inputs, states): return out def test_contrib_rnn(batch_size, input_size, hidden_size, seq_len, ctx): - rnn_data = mx.nd.normal(loc=0, scale=1, shape=(seq_len, batch_size, input_size)) - states = mx.nd.normal(loc=0, scale=1, shape=(batch_size, hidden_size)) + rnn_data = mx.nd.normal(loc=0, scale=1, shape=(seq_len, batch_size, input_size), ctx=ctx) + states = mx.nd.normal(loc=0, scale=1, shape=(batch_size, hidden_size), ctx=ctx) num_batches = 20 # Imperative @@ -96,7 +96,7 @@ def test_contrib_rnn(batch_size, input_size, hidden_size, seq_len, ctx): # gradients for the backward of the foreach symbol args_grad1 = {} for key in args1.keys(): - args_grad1[key] = mx.nd.empty(args1[key].shape) + args_grad1[key] = mx.nd.empty(args1[key].shape, ctx=ctx) exe = symnet.bind(ctx=ctx, args=args1, args_grad=args_grad1) tic = time.time() for i in range(num_batches): From 550e48a3e009c3e70aa97bece1c098e98e620690 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sun, 17 Jun 2018 00:42:24 +0000 Subject: [PATCH 095/135] address comments. --- python/mxnet/symbol/contrib.py | 2 +- src/imperative/imperative_utils.h | 4 ++-- src/operator/subgraph_op_common.cc | 4 ---- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 469b937cc151..40f45878f1af 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -34,7 +34,7 @@ from ..base import SymbolHandle, _as_list from ..attribute import AttrScope -__all__ = ["rand_zipfian"] +__all__ = ["rand_zipfian", "foreach"] def rand_zipfian(true_classes, num_sampled, range_max): """Draw random samples from an approximately log-uniform or Zipfian distribution. diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 5f21a08895e4..2331d7be155c 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -486,7 +486,7 @@ inline void PushOperator(const OpStatePtr& state, // For operators with subgraphs, we need to invoke them in the main thread // instead of the threaded engine. 
- if (!attrs.subgraphs.empty()) { + if (exec_type == ExecType::kSubgraphExec) { RunContext rctx{ctx, nullptr}; run(rctx, engine::CallbackOnComplete()); } else if (exec_type == ExecType::kSync) { @@ -534,7 +534,7 @@ inline void PushOperator(const OpStatePtr& state, } }; - if (!attrs.subgraphs.empty()) { + if (exec_type == ExecType::kSubgraphExec) { RunContext rctx{ctx, nullptr}; run(rctx, engine::CallbackOnComplete()); } else if (exec_type == ExecType::kSync) { diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index fb1bc365e890..000d72d8de96 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -194,10 +194,6 @@ void LoopState::Forward(int iter_no, CHECK(iter_ops.empty()); iter_ops.push_back(op); } - // TODO(zhengda) we need to avoid shape inference and memory plan whenever the op is - // called. Currently, CachedOp allocates memory each time Forward is called. - // I need to fix this once the PR for static memory allocation in CachedOp is - // merged. https://github.com/apache/incubator-mxnet/pull/10817 OpStatePtr state = op->Forward(nullptr, inputs, outputs); if (is_recording) { From 97d0332c8482c6fcb95e9465da1ea7ce6ae323ad Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sun, 17 Jun 2018 01:34:42 +0000 Subject: [PATCH 096/135] update shape/dtype/storage inference. --- src/operator/control_flow.cc | 47 ++++++++---- src/operator/subgraph_op_common.cc | 117 ++++++++++++++++++----------- 2 files changed, 107 insertions(+), 57 deletions(-) diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 03c3c0dbe8a9..2a6421021686 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -243,39 +243,55 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, std::vector *out_shape) { const ForeachParam& params = nnvm::get(attrs.parsed); CHECK_EQ(out_shape->size(), (size_t) params.num_outputs); - nnvm::ShapeVector shape_inputs = *in_shape; + CHECK_EQ(attrs.subgraphs.size(), 1U); + nnvm::Graph g; + g.outputs = attrs.subgraphs[0]->outputs; + const auto& idx = g.indexed_graph(); + CHECK_EQ(idx.input_nodes().size(), in_shape->size()); + CHECK_EQ(idx.outputs().size(), out_shape->size()); + + // Put the input and output shapes to the shape vector. + nnvm::ShapeVector shapes(idx.num_node_entries()); + const auto &input_nids = idx.input_nodes(); + CHECK_EQ(input_nids.size(), in_shape->size()); + for (size_t i = 0; i < in_shape->size(); i++) { + auto eid = idx.entry_id(input_nids[i], 0); + shapes[eid] = in_shape->at(i); + } + CHECK_EQ(g.outputs.size(), out_shape->size()); + for (size_t i = 0; i < out_shape->size(); i++) { + auto eid = idx.entry_id(g.outputs[i]); + shapes[eid] = out_shape->at(i); + } // foreach iterates over the first input NDArray over the first dimension. 
size_t loc0 = params.in_data_locs[0]; size_t len = in_shape->at(loc0)[0]; for (size_t i = 0; i < params.in_data_locs.ndim(); i++) { size_t loc = params.in_data_locs[i]; + auto eid = idx.entry_id(input_nids[loc], 0); CHECK_EQ(len, in_shape->at(loc)[0]); - shape_inputs[loc] = TShape(in_shape->at(loc).begin() + 1, in_shape->at(loc).end()); + shapes[eid] = TShape(in_shape->at(loc).begin() + 1, in_shape->at(loc).end()); } - CHECK_EQ(attrs.subgraphs.size(), 1U); - nnvm::Graph g; - g.outputs = attrs.subgraphs[0]->outputs; - const auto& idx = g.indexed_graph(); - CHECK_EQ(idx.input_nodes().size(), in_shape->size()); - CHECK_EQ(idx.outputs().size(), out_shape->size()); - imperative::CheckAndInferShape(&g, std::move(shape_inputs), true); - const auto& shapes = g.GetAttr("shape"); + // Infer shape of the graph. + g.attrs["shape"] = std::make_shared(std::move(shapes)); + g = exec::InferShape(std::move(g)); + + const auto& shapes1 = g.GetAttr("shape"); // Inferring the shape in the subgraph may infer the shape of the inputs. // We need to copy the inferred input shapes back. - const auto &input_nids = idx.input_nodes(); CHECK_EQ(input_nids.size(), in_shape->size()); for (size_t i = 0; i < in_shape->size(); i++) { auto eid = idx.entry_id(input_nids[i], 0); // If the input shape is none, we should update them. if ((*in_shape)[i].ndim() == 0 || (*in_shape)[i].Size() == 0) - SHAPE_ASSIGN_CHECK(*in_shape, i, shapes[eid]); + SHAPE_ASSIGN_CHECK(*in_shape, i, shapes1[eid]); } // For the shape of output data. for (int i = 0; i < params.num_out_data; i++) { uint32_t eid = idx.entry_id(g.outputs[i]); - const auto& g_out_shape = shapes[eid]; + const auto& g_out_shape = shapes1[eid]; auto out = TShape(g_out_shape.ndim() + 1); out[0] = len; for (size_t i = 1; i < out.ndim(); i++) @@ -286,14 +302,15 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, // For the remaining shapes. for (size_t i = params.num_out_data; i < g.outputs.size(); i++) { uint32_t eid = idx.entry_id(g.outputs[i]); - SHAPE_ASSIGN_CHECK(*out_shape, i, shapes[eid]); + SHAPE_ASSIGN_CHECK(*out_shape, i, shapes1[eid]); } size_t num_states = g.outputs.size() - params.num_out_data; for (size_t i = 0; i < num_states; i++) { size_t loc = params.in_state_locs[i]; CHECK((*out_shape)[i + params.num_out_data] == (*in_shape)[loc]); } - return true; + // Check if we have inferred the shapes correctly. + return g.GetAttr("shape_num_unknown_nodes") == 0; } static bool ForeachType(const nnvm::NodeAttrs& attrs, diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index 000d72d8de96..8c8831f3fc2f 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -25,64 +25,97 @@ namespace mxnet { namespace op { bool InferSubgraphDataType(const nnvm::Symbol &subgraph, - std::vector *in_type, - std::vector *out_type) { - nnvm::DTypeVector dtype_inputs = *in_type; + std::vector *in_types, + std::vector *out_types) { nnvm::Graph g; g.outputs = subgraph.outputs; - const auto& idx = g.indexed_graph(); - CHECK_EQ(idx.input_nodes().size(), in_type->size()); - CHECK_EQ(idx.outputs().size(), out_type->size()); - imperative::CheckAndInferType(&g, std::move(dtype_inputs), true); - - const auto &dtypes = g.GetAttr("dtype"); - - // Inferring the data type in the subgraph may infer the data type of the inputs. - // We need to copy the inferred input data types back. 
- const auto &input_nids = idx.input_nodes(); - CHECK_EQ(input_nids.size(), in_type->size()); - for (size_t i = 0; i < in_type->size(); i++) { - auto eid = idx.entry_id(input_nids[i], 0); - TYPE_ASSIGN_CHECK(*in_type, i, dtypes[eid]); + const auto& idx_g = g.indexed_graph(); + CHECK_EQ(idx_g.input_nodes().size(), in_types->size()); + CHECK_EQ(idx_g.outputs().size(), out_types->size()); + + // Put the input and output data types to the dtype vector. + nnvm::DTypeVector types(idx_g.num_node_entries(), -1); + const auto &input_nids = idx_g.input_nodes(); + CHECK_EQ(input_nids.size(), in_types->size()); + for (size_t i = 0; i < in_types->size(); i++) { + auto eid = idx_g.entry_id(input_nids[i], 0); + types[eid] = in_types->at(i); + } + CHECK_EQ(g.outputs.size(), out_types->size()); + for (size_t i = 0; i < out_types->size(); i++) { + auto eid = idx_g.entry_id(g.outputs[i]); + types[eid] = out_types->at(i); } - for (size_t i = 0; i < g.outputs.size(); i++) - TYPE_ASSIGN_CHECK(*out_type, i, dtypes[idx.entry_id(g.outputs[i])]); - return true; + // Infer data type of the graph. + g.attrs["dtype"] = std::make_shared(std::move(types)); + g = exec::InferType(std::move(g)); + + const auto& types1 = g.GetAttr("dtype"); + // assign to in_types + for (size_t i = 0; i < in_types->size(); ++i) { + const auto eid = idx_g.entry_id(input_nids[i], 0); + TYPE_ASSIGN_CHECK(*in_types, i, types1[eid]); + } + // assign to out_types + for (size_t i = 0; i < g.outputs.size(); ++i) { + const auto eid = idx_g.entry_id(g.outputs[i]); + TYPE_ASSIGN_CHECK(*out_types, i, types1[eid]); + } + // Check if we have inferred the dtypes correctly. + return g.GetAttr("dtype_num_unknown_nodes") == 0; } bool InferSubgraphStorage(const nnvm::Symbol &subgraph, const int dev_mask, DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { + std::vector *in_stypes, + std::vector *out_stypes) { nnvm::Graph g; g.outputs = subgraph.outputs; - const auto& idx = g.indexed_graph(); - CHECK_EQ(idx.input_nodes().size(), in_attrs->size()); - CHECK_EQ(idx.outputs().size(), out_attrs->size()); - exec::DevMaskVector dev_masks(idx.num_nodes(), dev_mask); - StorageTypeVector storage_type_inputs = *in_attrs; - imperative::CheckAndInferStorageType(&g, std::move(dev_masks), - std::move(storage_type_inputs), true); + const auto& idx_g = g.indexed_graph(); + CHECK_EQ(idx_g.input_nodes().size(), in_stypes->size()); + CHECK_EQ(idx_g.outputs().size(), out_stypes->size()); + exec::DevMaskVector dev_masks(idx_g.num_node_entries(), dev_mask); - const auto& stypes = g.GetAttr("storage_type"); + // Put the input and output storages to the storage vector. + nnvm::StorageVector stypes(idx_g.num_node_entries(), exec::kBadStorageID); + const auto &input_nids = idx_g.input_nodes(); + CHECK_EQ(input_nids.size(), in_stypes->size()); + for (size_t i = 0; i < in_stypes->size(); i++) { + auto eid = idx_g.entry_id(input_nids[i], 0); + stypes[eid] = in_stypes->at(i); + } + CHECK_EQ(g.outputs.size(), out_stypes->size()); + for (size_t i = 0; i < out_stypes->size(); i++) { + auto eid = idx_g.entry_id(g.outputs[i]); + stypes[eid] = out_stypes->at(i); + } + + // Infer storage type of the graph. + bool dev_match = g.attrs.count("dev_mask") && + g.GetAttr("dev_mask") == dev_masks; + if (!dev_match) { + g.attrs["dev_mask"] = std::make_shared(std::move(dev_masks)); + } + g.attrs["storage_type"] = std::make_shared(std::move(stypes)); + g = exec::InferStorageType(std::move(g)); - // Inferring the storage in the subgraph may infer the storage of the inputs. 
- // We need to copy the inferred input storage back. - const auto &input_nids = idx.input_nodes(); - CHECK_EQ(input_nids.size(), in_attrs->size()); - for (size_t i = 0; i < in_attrs->size(); i++) { - auto eid = idx.entry_id(input_nids[i], 0); - STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, i, stypes[eid]); + const auto& stypes1 = g.GetAttr("storage_type"); + // assign to in_types + for (size_t i = 0; i < in_stypes->size(); ++i) { + const auto eid = idx_g.entry_id(input_nids[i], 0); + STORAGE_TYPE_ASSIGN_CHECK(*in_stypes, i, stypes1[eid]); } DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx); - auto &outputs = idx.outputs(); - CHECK(outputs.size() == out_attrs->size()); - for (size_t i = 0; i < out_attrs->size(); i++) - STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, i, stypes[idx.entry_id(outputs[i])]); - return true; + // assign to out_types + for (size_t i = 0; i < g.outputs.size(); ++i) { + const auto eid = idx_g.entry_id(g.outputs[i]); + STORAGE_TYPE_ASSIGN_CHECK(*out_stypes, i, stypes1[eid]); + } + // Check if we have inferred the storages correctly. + return g.GetAttr("storage_type_num_unknown_nodes") == 0; } bool InferSubgraphBackwardStorage(const nnvm::Symbol &subgraph, From 0b0a36e53b4f71fcb7097bd0bbd78e1ae19772ce Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sun, 17 Jun 2018 19:24:06 +0000 Subject: [PATCH 097/135] update contrib API docs. --- docs/api/python/ndarray/contrib.md | 1 + docs/api/python/symbol/contrib.md | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/api/python/ndarray/contrib.md b/docs/api/python/ndarray/contrib.md index b017c601208e..36a2c151e859 100644 --- a/docs/api/python/ndarray/contrib.md +++ b/docs/api/python/ndarray/contrib.md @@ -52,6 +52,7 @@ In the rest of this document, we list routines provided by the `ndarray.contrib` fft ifft quantize + foreach ``` ## API Reference diff --git a/docs/api/python/symbol/contrib.md b/docs/api/python/symbol/contrib.md index f2bb3f15deed..664716560506 100644 --- a/docs/api/python/symbol/contrib.md +++ b/docs/api/python/symbol/contrib.md @@ -52,6 +52,7 @@ In the rest of this document, we list routines provided by the `symbol.contrib` fft ifft quantize + foreach ``` ## API Reference From f1ff55d5b1a9ccad7f7e095113af7cf585ef9682 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 18 Jun 2018 19:07:18 +0000 Subject: [PATCH 098/135] support nested foreach. --- src/operator/subgraph_op_common.cc | 3 ++ tests/python/unittest/test_operator.py | 51 ++++++++++++++------------ 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index 8c8831f3fc2f..a3ae200a9f0e 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -207,6 +207,9 @@ void LoopState::Forward(int iter_no, std::vector > kwargs; kwargs.push_back(std::pair("inline_limit", "0")); + // We turn on static_alloc for two reasons. + // It avoids the overhead of unnecessary memory allocation. + // only static_alloc supports nested call of CachedOp. 
kwargs.push_back(std::pair("static_alloc", "1")); CachedOpPtr op; if (is_recording && iter_ops.size() > (size_t) iter_no) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index eefe7efbffd7..02a97e160485 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -6128,48 +6128,51 @@ def step_in(in1, states): out = in1 * 2 + states[0] return (out, [out]) - def step(in1, states): + def step_sym(in1, states): out1 = mx.sym.contrib.foreach(step_in, in1, states) out = mx.sym.broadcast_add(out1[0], states[0]) return (out, [mx.sym.squeeze(mx.sym.slice(out, begin=(0, 0), end=(1, 2)))]) + def step_nd(in1, states): + out1 = mx.nd.contrib.foreach(step_in, in1, states) + out = mx.nd.broadcast_add(out1[0], states[0]) + return (out, [mx.nd.squeeze(mx.nd.slice(out, begin=(0, 0), end=(1, 2)))]) data_sym = mx.sym.var("v1") state_sym = mx.sym.var("v2") - out = mx.sym.contrib.foreach(step, data_sym, [state_sym]) + out, states = mx.sym.contrib.foreach(step_sym, data_sym, [state_sym]) + assert isinstance(states, list) + assert len(states) == 1 + out = mx.sym.broadcast_add(out, states[0]) - out1 = _as_list(out[0]) - for i in range(len(out1)): - out1[i] = out1[i] - out1.extend(out[1]) - out = mx.sym.Group(out1) js_1 = out.tojson() out = mx.sym.load_json(js_1) js_2 = out.tojson() assert js_1 == js_2 - data = mx.nd.arange(4).reshape((1, 2, 2)) + data = mx.nd.arange(8).reshape((2, 2, 2)) state = mx.nd.arange(2) data_grad = mx.nd.empty(data.shape) state_grad = mx.nd.empty(state.shape) e = out.bind(ctx=default_context(), args={'v1':data, 'v2':state}, args_grad={'v1':data_grad, 'v2':state_grad}) e.forward(is_train=True) - out = mx.nd.zeros_like(data) - for i in range(data.shape[0]): - data1 = data[i] - out1 = mx.nd.zeros_like(data1) - for j in range(data1.shape[0]): - if (j > 0): - out1[j] = out1[j-1] + data1[j] * 2 - else: - out1[j] = data1[j] * 2 + state - if (i > 0): - state = mx.nd.squeeze(mx.nd.slice(out[i-1], begin=(0, 0), end=(1, 2))) - out[i] = mx.nd.broadcast_add(out1, state) - else: - out[i] = mx.nd.broadcast_add(out1, state) - out = out - assert_almost_equal(out.asnumpy(), e.outputs[0].asnumpy(), rtol=0.001, atol=0.0001) + out_grads = [] + for out in e.outputs: + out_grads.append(mx.nd.random.uniform(shape=out.shape)) + e.backward(out_grads) + + data.attach_grad() + state.attach_grad() + with mx.autograd.record(): + out, states = mx.nd.contrib.foreach(step_nd, data, [state]) + assert isinstance(states, list) + assert len(states) == 1 + res = mx.nd.broadcast_add(out, states[0]) + assert_almost_equal(res.asnumpy(), e.outputs[0].asnumpy(), rtol=0.001, atol=0.0001) + + res.backward(out_grads[0]) + assert_almost_equal(data.grad.asnumpy(), data_grad.asnumpy()) + assert_almost_equal(state.grad.asnumpy(), state_grad.asnumpy()) @with_seed() From 156f1c8b5b4f7d110ca45d990ba8bf601c696fdb Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 19 Jun 2018 01:48:34 +0000 Subject: [PATCH 099/135] use a single CachedOp for all iterations. 
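With static_alloc the CachedOp keeps its memory plan across calls, so a single
op can serve every iteration; the per-iteration OpStatePtr returned by Forward
is recorded so the backward pass can find the right computation state. The same
reuse is visible at the Gluon level, e.g. (a small sketch with arbitrary sizes,
not part of this patch):

    import mxnet as mx
    from mxnet import gluon

    cell = gluon.rnn.LSTMCell(4, prefix='lstm_')
    cell.initialize()
    cell.hybridize(static_alloc=True)   # one cached graph, buffers reused across calls

    x = mx.nd.random.uniform(shape=(2, 4))
    states = cell.begin_state(batch_size=2)
    for _ in range(3):                  # repeated calls go through the same CachedOp
        out, states = cell(x, states)
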
--- src/operator/subgraph_op_common.cc | 45 +++++++++++------------------- src/operator/subgraph_op_common.h | 7 ++--- 2 files changed, 18 insertions(+), 34 deletions(-) diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index a3ae200a9f0e..8004ba7efd73 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -184,6 +184,19 @@ bool InferSubgraphBackwardStorage(const nnvm::Symbol &subgraph, return true; } +LoopState::LoopState(const Symbol &g) { + this->subgraph_sym = g; + this->subgraph.outputs = g.outputs; + + std::vector > kwargs; + kwargs.push_back(std::pair("inline_limit", "0")); + // We turn on static_alloc for two reasons. + // It avoids the overhead of unnecessary memory allocation. + // only static_alloc supports nested call of CachedOp. + kwargs.push_back(std::pair("static_alloc", "1")); + iter_op = std::make_shared(subgraph_sym, kwargs); +} + void LoopState::Forward(int iter_no, std::vector cinputs, const std::vector& req, @@ -205,33 +218,7 @@ void LoopState::Forward(int iter_no, for (size_t i = 0; i < outputs.size(); i++) outputs[i] = &coutputs[i]; - std::vector > kwargs; - kwargs.push_back(std::pair("inline_limit", "0")); - // We turn on static_alloc for two reasons. - // It avoids the overhead of unnecessary memory allocation. - // only static_alloc supports nested call of CachedOp. - kwargs.push_back(std::pair("static_alloc", "1")); - CachedOpPtr op; - if (is_recording && iter_ops.size() > (size_t) iter_no) - op = iter_ops[iter_no]; - else if (!is_recording && iter_ops.size() == 1) - op = iter_ops[0]; - - // If we need to run backward and we don't have a cached op for this iteration, - // we create one for this iteration. - if (is_recording && op == nullptr) { - op = std::make_shared(subgraph_sym, kwargs); - CHECK_EQ(iter_ops.size(), iter_no); - iter_ops.push_back(op); - } else if (op == nullptr) { - // If we don't need to run backward and this is the first time of - // running the iteration, we need to create a new cached op. - op = std::make_shared(subgraph_sym, kwargs); - CHECK(iter_ops.empty()); - iter_ops.push_back(op); - } - OpStatePtr state = op->Forward(nullptr, inputs, outputs); - + OpStatePtr state = iter_op->Forward(nullptr, inputs, outputs); if (is_recording) { all_inputs.push_back(cinputs); all_outputs.push_back(coutputs); @@ -248,9 +235,9 @@ void LoopState::Backward(int iter_no, using namespace nnvm; using namespace imperative; - CHECK_GT(iter_ops.size(), iter_no) + CHECK_GT(all_states.size(), iter_no) << "We didn't record the computation for iteration " << iter_no; - auto op = iter_ops[iter_no]; + auto op = iter_op; std::vector inputs; std::vector outputs; inputs.reserve(op->num_backward_inputs()); diff --git a/src/operator/subgraph_op_common.h b/src/operator/subgraph_op_common.h index 4243b6b0f9dc..448742ac908c 100644 --- a/src/operator/subgraph_op_common.h +++ b/src/operator/subgraph_op_common.h @@ -71,16 +71,13 @@ class LoopState { // For training, each iteration has a cached op because each iteration // needs to maintain a set of memory buffers for all computation states, // which will be used in the backward. 
- std::vector iter_ops; + CachedOpPtr iter_op; std::vector all_states; Symbol subgraph_sym; nnvm::Graph subgraph; public: - explicit LoopState(const Symbol &g) { - this->subgraph_sym = g; - this->subgraph.outputs = g.outputs; - } + explicit LoopState(const Symbol &g); void Forward(int iter_no, std::vector cinputs, From 871fd3bbfa464ec1765b56e5ed2fe7567fcc8880 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 19 Jun 2018 03:09:54 +0000 Subject: [PATCH 100/135] use large dim. --- example/rnn/benchmark_rnn.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/example/rnn/benchmark_rnn.py b/example/rnn/benchmark_rnn.py index 285239344a13..5e499755dbed 100644 --- a/example/rnn/benchmark_rnn.py +++ b/example/rnn/benchmark_rnn.py @@ -106,11 +106,11 @@ def test_contrib_rnn(batch_size, input_size, hidden_size, seq_len, ctx): if __name__ == '__main__': print("Benchmark in CPU (batch size: 1)") - test_contrib_rnn(1, 100, 100, 100, mx.cpu(0)) + test_contrib_rnn(1, 500, 500, 100, mx.cpu(0)) print("Benchmark in CPU (batch size: 32)") - test_contrib_rnn(32, 100, 100, 100, mx.cpu(0)) + test_contrib_rnn(32, 500, 500, 100, mx.cpu(0)) if len(get_gpus()) > 0: print("Benchmark in GPU (batch size: 1)") - test_contrib_rnn(1, 100, 100, 100, mx.gpu(0)) + test_contrib_rnn(1, 500, 500, 100, mx.gpu(0)) print("Benchmark in GPU (batch size: 32)") - test_contrib_rnn(32, 100, 100, 100, mx.gpu(0)) + test_contrib_rnn(32, 500, 500, 100, mx.gpu(0)) From b5dfc3f20c1ff71f16af76199edca1f1e5265d8b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 19 Jun 2018 03:46:58 +0000 Subject: [PATCH 101/135] update benchmark. --- example/rnn/benchmark_rnn.py | 61 +++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/example/rnn/benchmark_rnn.py b/example/rnn/benchmark_rnn.py index 5e499755dbed..baac0373c344 100644 --- a/example/rnn/benchmark_rnn.py +++ b/example/rnn/benchmark_rnn.py @@ -19,6 +19,7 @@ import mxnet as mx from mxnet import gluon import time +import copy def get_gpus(): """ @@ -31,26 +32,26 @@ def get_gpus(): return range(len([i for i in re.split('\n') if 'GPU' in i])) class TestRNNLayer(gluon.HybridBlock): - def __init__(self, hidden_size, prefix=None, params=None): + def __init__(self, cell, prefix=None, params=None): super(TestRNNLayer, self).__init__(prefix=prefix, params=params) - self.cell = gluon.rnn.RNNCell(hidden_size, prefix='rnn_') + self.cell = cell def hybrid_forward(self, F, inputs, states): - states = [states] out, states = F.contrib.foreach(self.cell, inputs, states) return out -def test_contrib_rnn(batch_size, input_size, hidden_size, seq_len, ctx): - rnn_data = mx.nd.normal(loc=0, scale=1, shape=(seq_len, batch_size, input_size), ctx=ctx) - states = mx.nd.normal(loc=0, scale=1, shape=(batch_size, hidden_size), ctx=ctx) +def test_contrib_rnn(cell, rnn_data, states): + ctx = rnn_data.context num_batches = 20 + cell1 = copy.deepcopy(cell) + cell2 = copy.deepcopy(cell) # Imperative - layer1 = TestRNNLayer(hidden_size) + layer1 = TestRNNLayer(cell1) layer1.initialize(ctx=ctx) # Hybridize - layer2 = TestRNNLayer(hidden_size) + layer2 = TestRNNLayer(cell2) layer2.initialize(ctx=ctx) layer2.hybridize() @@ -66,22 +67,18 @@ def test_contrib_rnn(batch_size, input_size, hidden_size, seq_len, ctx): mx.nd.waitall() print("Hybrid inference takes " + str(time.time() - tic)) - #trainer = gluon.Trainer(layer1.collect_params(), 'sgd', {'learning_rate' : 0.03}) tic = time.time() for i in range(num_batches): with mx.autograd.record(): res1 = layer1(rnn_data, 
states) res1.backward() - #trainer.step(batch_size) print("Imperative training takes " + str(time.time() - tic)) - #trainer = gluon.Trainer(layer2.collect_params(), 'sgd', {'learning_rate' : 0.03}) tic = time.time() for i in range(num_batches): with mx.autograd.record(): res2 = layer2(rnn_data, states) res2.backward() - #trainer.step(batch_size) print("Hybrid training takes " + str(time.time() - tic)) layer2.export("foreach_rnn") @@ -92,7 +89,8 @@ def test_contrib_rnn(batch_size, input_size, hidden_size, seq_len, ctx): for key in params.keys(): args1[key] = params[key].data() args1['data0'] = rnn_data - args1['data1'] = states + for i in range(len(states)): + args1['data' + str(i + 1)] = states[i] # gradients for the backward of the foreach symbol args_grad1 = {} for key in args1.keys(): @@ -105,12 +103,31 @@ def test_contrib_rnn(batch_size, input_size, hidden_size, seq_len, ctx): print("Symbol training takes " + str(time.time() - tic)) if __name__ == '__main__': - print("Benchmark in CPU (batch size: 1)") - test_contrib_rnn(1, 500, 500, 100, mx.cpu(0)) - print("Benchmark in CPU (batch size: 32)") - test_contrib_rnn(32, 500, 500, 100, mx.cpu(0)) - if len(get_gpus()) > 0: - print("Benchmark in GPU (batch size: 1)") - test_contrib_rnn(1, 500, 500, 100, mx.gpu(0)) - print("Benchmark in GPU (batch size: 32)") - test_contrib_rnn(32, 500, 500, 100, mx.gpu(0)) + ndim = 500 + seq_len = 100 + batch_sizes = [1, 32] + cells = [gluon.rnn.RNNCell(ndim, prefix='rnn_'), + gluon.rnn.LSTMCell(ndim, prefix='rnn_')] + ctxs = [mx.cpu(0), mx.gpu(0)] + for ctx in ctxs: + for batch_size in batch_sizes: + for cell in cells: + if len(get_gpus()) == 0 and ctx == mx.gpu(0): + continue + + if isinstance(cell, gluon.rnn.RNNCell): + rnn_data = mx.nd.normal(loc=0, scale=1, shape=(seq_len, batch_size, ndim), + ctx=mx.cpu(0)) + states = [] + states.append(mx.nd.normal(loc=0, scale=1, shape=(batch_size, ndim), + ctx=mx.cpu(0))) + elif isinstance(cell, gluon.rnn.LSTMCell): + rnn_data = mx.nd.normal(loc=0, scale=1, shape=(seq_len, batch_size, ndim), + ctx=mx.cpu(0)) + states = [] + states.append(mx.nd.normal(loc=0, scale=1, shape=(batch_size, ndim), + ctx=mx.cpu(0))) + states.append(mx.nd.normal(loc=0, scale=1, shape=(batch_size, ndim), + ctx=mx.cpu(0))) + print("Benchmark {} in CPU (batch size: {})".format(cell._alias(), batch_size)) + test_contrib_rnn(cell, rnn_data, states) From 202a74c2ae650824c40950f739317190d016b477 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 19 Jun 2018 16:58:37 +0000 Subject: [PATCH 102/135] update benchmark. 
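This revision adds a third timed configuration in which only the RNN cell is hybridized, with static memory allocation, while the surrounding foreach layer stays imperative. A minimal illustrative sketch of that variant (TestRNNLayer, cell, rnn_data and states are names from this benchmark script; the snippet is a sketch, not part of the patch itself):

    cell_static = copy.deepcopy(cell)
    cell_static.hybridize(static_alloc=True)  # cache the cell's graph and reuse its buffers
    layer = TestRNNLayer(cell_static)         # the foreach wrapper itself stays imperative
    layer.initialize(ctx=ctx)
    out = layer(rnn_data, states)

static_alloc=True lets the cached graph reuse pre-allocated memory across calls; the new "Hybrid-cell" timings below compare this variant against the purely imperative and fully hybridized layers.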
--- example/rnn/benchmark_rnn.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/example/rnn/benchmark_rnn.py b/example/rnn/benchmark_rnn.py index baac0373c344..f1fd75a0df15 100644 --- a/example/rnn/benchmark_rnn.py +++ b/example/rnn/benchmark_rnn.py @@ -43,18 +43,24 @@ def hybrid_forward(self, F, inputs, states): def test_contrib_rnn(cell, rnn_data, states): ctx = rnn_data.context num_batches = 20 - cell1 = copy.deepcopy(cell) - cell2 = copy.deepcopy(cell) # Imperative + cell1 = copy.deepcopy(cell) layer1 = TestRNNLayer(cell1) layer1.initialize(ctx=ctx) # Hybridize + cell2 = copy.deepcopy(cell) layer2 = TestRNNLayer(cell2) layer2.initialize(ctx=ctx) layer2.hybridize() + # Hybridize + cell3 = copy.deepcopy(cell) + cell3.hybridize(static_alloc=True) + layer3 = TestRNNLayer(cell3) + layer3.initialize(ctx=ctx) + tic = time.time() for i in range(num_batches): res1 = layer1(rnn_data, states) @@ -67,6 +73,12 @@ def test_contrib_rnn(cell, rnn_data, states): mx.nd.waitall() print("Hybrid inference takes " + str(time.time() - tic)) + tic = time.time() + for i in range(num_batches): + res3 = layer3(rnn_data, states) + mx.nd.waitall() + print("Hybrid-cell inference takes " + str(time.time() - tic)) + tic = time.time() for i in range(num_batches): with mx.autograd.record(): @@ -81,6 +93,13 @@ def test_contrib_rnn(cell, rnn_data, states): res2.backward() print("Hybrid training takes " + str(time.time() - tic)) + tic = time.time() + for i in range(num_batches): + with mx.autograd.record(): + res3 = layer3(rnn_data, states) + res3.backward() + print("Hybrid-cell training takes " + str(time.time() - tic)) + layer2.export("foreach_rnn") symnet = mx.symbol.load('foreach_rnn-symbol.json') # Inputs From 0606c3c1c049d2d20c7a149bba7b01a1fc5e9354 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 19 Jun 2018 17:59:28 +0000 Subject: [PATCH 103/135] update benchmark. 
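Two measurement details change here: the hybridized layer is called once right after hybridize() so that graph construction is not counted, and every timed loop now ends with mx.nd.waitall(). MXNet dispatches operators to its engine asynchronously, so without that barrier the timer would mostly measure how quickly work is enqueued rather than how long it takes to run. The timing pattern used throughout the script looks roughly like this (names taken from the script; a sketch, not the exact patch):

    layer2.hybridize()
    layer2(rnn_data, states)            # warm-up: builds and caches the graph
    tic = time.time()
    for _ in range(num_batches):
        with mx.autograd.record():
            res = layer2(rnn_data, states)
        res.backward()
    mx.nd.waitall()                     # wait for the async engine before stopping the clock
    print("Hybrid training takes " + str(time.time() - tic))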
--- example/rnn/benchmark_rnn.py | 56 ++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/example/rnn/benchmark_rnn.py b/example/rnn/benchmark_rnn.py index f1fd75a0df15..1e605ddee034 100644 --- a/example/rnn/benchmark_rnn.py +++ b/example/rnn/benchmark_rnn.py @@ -40,12 +40,13 @@ def hybrid_forward(self, F, inputs, states): out, states = F.contrib.foreach(self.cell, inputs, states) return out -def test_contrib_rnn(cell, rnn_data, states): +def benchmark_rnn(cell, rnn_data, states): ctx = rnn_data.context num_batches = 20 # Imperative cell1 = copy.deepcopy(cell) + cell1.hybridize() layer1 = TestRNNLayer(cell1) layer1.initialize(ctx=ctx) @@ -54,6 +55,7 @@ def test_contrib_rnn(cell, rnn_data, states): layer2 = TestRNNLayer(cell2) layer2.initialize(ctx=ctx) layer2.hybridize() + layer2(rnn_data, states) # Hybridize cell3 = copy.deepcopy(cell) @@ -79,11 +81,28 @@ def test_contrib_rnn(cell, rnn_data, states): mx.nd.waitall() print("Hybrid-cell inference takes " + str(time.time() - tic)) + layer2.export("foreach_rnn") + symnet = mx.symbol.load('foreach_rnn-symbol.json') + args1 = {} + params = layer2.collect_params() + for key in params.keys(): + args1[key] = params[key].data() + args1['data0'] = rnn_data + for i in range(len(states)): + args1['data' + str(i + 1)] = states[i] + exe = symnet.bind(ctx=ctx, args=args1) + tic = time.time() + for i in range(num_batches): + exe.forward(is_train=False) + mx.nd.waitall() + print("Symbol inference takes " + str(time.time() - tic)) + tic = time.time() for i in range(num_batches): with mx.autograd.record(): res1 = layer1(rnn_data, states) res1.backward() + mx.nd.waitall() print("Imperative training takes " + str(time.time() - tic)) tic = time.time() @@ -91,6 +110,7 @@ def test_contrib_rnn(cell, rnn_data, states): with mx.autograd.record(): res2 = layer2(rnn_data, states) res2.backward() + mx.nd.waitall() print("Hybrid training takes " + str(time.time() - tic)) tic = time.time() @@ -98,18 +118,9 @@ def test_contrib_rnn(cell, rnn_data, states): with mx.autograd.record(): res3 = layer3(rnn_data, states) res3.backward() + mx.nd.waitall() print("Hybrid-cell training takes " + str(time.time() - tic)) - layer2.export("foreach_rnn") - symnet = mx.symbol.load('foreach_rnn-symbol.json') - # Inputs - args1 = {} - params = layer2.collect_params() - for key in params.keys(): - args1[key] = params[key].data() - args1['data0'] = rnn_data - for i in range(len(states)): - args1['data' + str(i + 1)] = states[i] # gradients for the backward of the foreach symbol args_grad1 = {} for key in args1.keys(): @@ -119,22 +130,24 @@ def test_contrib_rnn(cell, rnn_data, states): for i in range(num_batches): exe.forward(is_train=True) exe.backward(res2) + mx.nd.waitall() print("Symbol training takes " + str(time.time() - tic)) + print("") if __name__ == '__main__': - ndim = 500 + ndim = 512 seq_len = 100 batch_sizes = [1, 32] - cells = [gluon.rnn.RNNCell(ndim, prefix='rnn_'), + cells = [gluon.rnn.GRUCell(ndim, prefix='rnn_'), gluon.rnn.LSTMCell(ndim, prefix='rnn_')] ctxs = [mx.cpu(0), mx.gpu(0)] - for ctx in ctxs: - for batch_size in batch_sizes: - for cell in cells: + for cell in cells: + for ctx in ctxs: + for batch_size in batch_sizes: if len(get_gpus()) == 0 and ctx == mx.gpu(0): continue - if isinstance(cell, gluon.rnn.RNNCell): + if isinstance(cell, gluon.rnn.GRUCell): rnn_data = mx.nd.normal(loc=0, scale=1, shape=(seq_len, batch_size, ndim), ctx=mx.cpu(0)) states = [] @@ -148,5 +161,10 @@ def test_contrib_rnn(cell, rnn_data, states): 
ctx=mx.cpu(0))) states.append(mx.nd.normal(loc=0, scale=1, shape=(batch_size, ndim), ctx=mx.cpu(0))) - print("Benchmark {} in CPU (batch size: {})".format(cell._alias(), batch_size)) - test_contrib_rnn(cell, rnn_data, states) + if ctx == mx.gpu(0): + dev = "GPU" + else: + dev = "CPU" + print("Benchmark {} in {} (batch size: {})".format(cell._alias(), dev, + batch_size)) + benchmark_rnn(cell, rnn_data, states) From 6019de510a71dd6195334324b700787c7082fca8 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 19 Jun 2018 19:25:48 +0000 Subject: [PATCH 104/135] update benchmark. --- example/rnn/benchmark_rnn.py | 39 +++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/example/rnn/benchmark_rnn.py b/example/rnn/benchmark_rnn.py index 1e605ddee034..5e41b7508b66 100644 --- a/example/rnn/benchmark_rnn.py +++ b/example/rnn/benchmark_rnn.py @@ -45,6 +45,11 @@ def benchmark_rnn(cell, rnn_data, states): num_batches = 20 # Imperative + cell0 = copy.deepcopy(cell) + layer0 = TestRNNLayer(cell0) + layer0.initialize(ctx=ctx) + + # Hybridize cell1 = copy.deepcopy(cell) cell1.hybridize() layer1 = TestRNNLayer(cell1) @@ -65,21 +70,27 @@ def benchmark_rnn(cell, rnn_data, states): tic = time.time() for i in range(num_batches): - res1 = layer1(rnn_data, states) + res0 = layer0(rnn_data, states) mx.nd.waitall() print("Imperative inference takes " + str(time.time() - tic)) tic = time.time() for i in range(num_batches): - res2 = layer2(rnn_data, states) + res1 = layer1(rnn_data, states) mx.nd.waitall() - print("Hybrid inference takes " + str(time.time() - tic)) + print("Hybrid-cell inference takes " + str(time.time() - tic)) tic = time.time() for i in range(num_batches): res3 = layer3(rnn_data, states) mx.nd.waitall() - print("Hybrid-cell inference takes " + str(time.time() - tic)) + print("Static-hybrid-cell inference takes " + str(time.time() - tic)) + + tic = time.time() + for i in range(num_batches): + res2 = layer2(rnn_data, states) + mx.nd.waitall() + print("Hybrid inference takes " + str(time.time() - tic)) layer2.export("foreach_rnn") symnet = mx.symbol.load('foreach_rnn-symbol.json') @@ -100,18 +111,18 @@ def benchmark_rnn(cell, rnn_data, states): tic = time.time() for i in range(num_batches): with mx.autograd.record(): - res1 = layer1(rnn_data, states) - res1.backward() + res0 = layer0(rnn_data, states) + res0.backward() mx.nd.waitall() print("Imperative training takes " + str(time.time() - tic)) tic = time.time() for i in range(num_batches): with mx.autograd.record(): - res2 = layer2(rnn_data, states) - res2.backward() + res1 = layer1(rnn_data, states) + res1.backward() mx.nd.waitall() - print("Hybrid training takes " + str(time.time() - tic)) + print("Hybrid-cell training takes " + str(time.time() - tic)) tic = time.time() for i in range(num_batches): @@ -119,7 +130,15 @@ def benchmark_rnn(cell, rnn_data, states): res3 = layer3(rnn_data, states) res3.backward() mx.nd.waitall() - print("Hybrid-cell training takes " + str(time.time() - tic)) + print("Static-hybrid-cell training takes " + str(time.time() - tic)) + + tic = time.time() + for i in range(num_batches): + with mx.autograd.record(): + res2 = layer2(rnn_data, states) + res2.backward() + mx.nd.waitall() + print("Hybrid training takes " + str(time.time() - tic)) # gradients for the backward of the foreach symbol args_grad1 = {} From 045186df9e5c2c9088d17af2118d2ccae45cc09d Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 20 Jun 2018 18:41:04 +0000 Subject: [PATCH 105/135] return symbol arrays correctly 
in MXSymbolCutSubgraph. --- include/mxnet/c_api.h | 2 +- python/mxnet/symbol/contrib.py | 9 +++++---- src/c_api/c_api_symbolic.cc | 11 +++++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 528a76bbf7d0..15d213273ad4 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1070,7 +1070,7 @@ MXNET_DLL int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle *inputs, * \param inputs The nodes that connect to the subgraph. * \param input_size The number of such nodes. */ -MXNET_DLL int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle *inputs, +MXNET_DLL int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle **inputs, int *input_size); /*! diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 40f45878f1af..18fb4ed71333 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -112,13 +112,14 @@ def _get_graph_inputs(subg): return syms def _cut_subgraph(subg): - num_handles = ctypes.c_int(1000) - handles = c_array(SymbolHandle, [SymbolHandle(0) for i in range(1000)]) - check_call(_LIB.MXSymbolCutSubgraph(subg.handle, handles, ctypes.byref(num_handles))) + num_handles = ctypes.c_int(0) + handles = ctypes.POINTER(SymbolHandle)() + check_call(_LIB.MXSymbolCutSubgraph(subg.handle, ctypes.byref(handles), + ctypes.byref(num_handles))) syms = [] for i in range(num_handles.value): - s = Symbol(SymbolHandle(handles[i])) + s = Symbol(ctypes.cast(handles[i], SymbolHandle)) syms.append(s) return syms diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 98ce79959798..1352c77bc628 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -364,13 +364,12 @@ int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle *input_arr, int *inpu API_END_HANDLE_ERROR(); } -int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle *input_symbols, +int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle **input_symbols, int *input_size) { // Given a graph, we want to fetch the nodes that have been marked as part of // a subgraph. API_BEGIN(); nnvm::Symbol *s = static_cast(sym); - size_t max_input_size = *input_size; std::string subg_attr = "__subgraph_name__"; auto out_node = s->outputs[0].node; auto it = out_node->attrs.dict.find(subg_attr); @@ -400,9 +399,13 @@ int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle *input_symbols, input_syms[i] = new nnvm::Symbol(); input_syms[i]->outputs.push_back(orig_entries[i]); } - CHECK(input_syms.size() <= max_input_size); *input_size = input_syms.size(); - memcpy(input_symbols, input_syms.data(), sizeof(*input_symbols) * input_syms.size()); + + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); + ret->ret_handles.clear(); + ret->ret_handles.reserve(*input_size); + for (int i = 0; i < *input_size; ++i) ret->ret_handles.push_back(input_syms[i]); + *input_symbols = reinterpret_cast(dmlc::BeginPtr(ret->ret_handles)); } else { *input_size = 0; } From 484309e5005bf7ac27908b90deb219939f9cfc8b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 20 Jun 2018 20:28:51 +0000 Subject: [PATCH 106/135] return symbol arrays in MXSymbolGetInputSymbols. 
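As with MXSymbolCutSubgraph in the previous patch, the C API no longer asks the caller to pre-allocate a fixed-size handle buffer. MXSymbolGetInputSymbols now hands back a pointer to an array of symbol handles (the pointer array is held in a thread-local store on the C++ side) together with its length. The Python side then reads the result with ctypes, roughly as follows (a sketch of the pattern used in python/mxnet/symbol/contrib.py):

    num = ctypes.c_int(0)
    handles = ctypes.POINTER(SymbolHandle)()
    check_call(_LIB.MXSymbolGetInputSymbols(subg.handle,
                                            ctypes.byref(handles),
                                            ctypes.byref(num)))
    inputs = [Symbol(ctypes.cast(handles[i], SymbolHandle))
              for i in range(num.value)]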
--- include/mxnet/c_api.h | 2 +- python/mxnet/symbol/contrib.py | 9 +++++---- src/c_api/c_api_symbolic.cc | 11 +++++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 15d213273ad4..6c7626b917a4 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1058,7 +1058,7 @@ MXNET_DLL int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, * \param inputs The input symbols of the graph. * \param input_size the number of input symbols returned. */ -MXNET_DLL int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle *inputs, +MXNET_DLL int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **inputs, int *input_size); /*! diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 18fb4ed71333..e44a20d42a56 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -101,13 +101,14 @@ def rand_zipfian(true_classes, num_sampled, range_max): return sampled_classes, expected_count_true, expected_count_sampled def _get_graph_inputs(subg): - num_handles = ctypes.c_int(1000) - handles = c_array(SymbolHandle, [SymbolHandle(0) for i in range(1000)]) - check_call(_LIB.MXSymbolGetInputSymbols(subg.handle, handles, ctypes.byref(num_handles))) + num_handles = ctypes.c_int(0) + handles = ctypes.POINTER(SymbolHandle)() + check_call(_LIB.MXSymbolGetInputSymbols(subg.handle, ctypes.byref(handles), + ctypes.byref(num_handles))) syms = [] for i in range(num_handles.value): - s = Symbol(SymbolHandle(handles[i])) + s = Symbol(ctypes.cast(handles[i], SymbolHandle)) syms.append(s) return syms diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 1352c77bc628..524723be1e93 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -353,14 +353,17 @@ extern bool CutGraphInputs(const std::vector &input_entries, } -int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle *input_arr, int *input_size) { +int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **input_arr, int *input_size) { API_BEGIN(); nnvm::Symbol *s = static_cast(sym); - size_t max_input_size = *input_size; std::vector input_syms = mxnet::GetInputSymbols(*s); - CHECK(input_syms.size() <= max_input_size); *input_size = input_syms.size(); - std::copy(input_syms.begin(), input_syms.end(), input_arr); + + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); + ret->ret_handles.clear(); + ret->ret_handles.reserve(*input_size); + for (int i = 0; i < *input_size; ++i) ret->ret_handles.push_back(input_syms[i]); + *input_arr = reinterpret_cast(dmlc::BeginPtr(ret->ret_handles)); API_END_HANDLE_ERROR(); } From 0ebd5e5fed976711a0c47714fd14efb0eb94708f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 20 Jun 2018 21:17:24 +0000 Subject: [PATCH 107/135] fix lint error. --- python/mxnet/symbol/contrib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index e44a20d42a56..058bbb610177 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -30,7 +30,7 @@ pass from . import symbol -from ..base import _LIB, c_array, check_call +from ..base import _LIB, check_call from ..base import SymbolHandle, _as_list from ..attribute import AttrScope From a9e253dfa956192e5c64e8b27fa03d61f0fe7cb2 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 21 Jun 2018 01:14:18 +0000 Subject: [PATCH 108/135] use cachedop to infer storage in backward. 
--- src/operator/control_flow.cc | 6 ++- src/operator/subgraph_op_common.cc | 66 ------------------------------ src/operator/subgraph_op_common.h | 10 ----- 3 files changed, 4 insertions(+), 78 deletions(-) diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 2a6421021686..4b11d77150be 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -341,8 +341,10 @@ static bool BackwardForeachStorageType(const nnvm::NodeAttrs& attrs, const ForeachParam& params = nnvm::get(attrs.parsed); CHECK_EQ(out_attrs->size(), (size_t) params.num_args - 1); CHECK_EQ(attrs.subgraphs.size(), 1U); - return InferSubgraphBackwardStorage(*attrs.subgraphs[0], dev_mask, - dispatch_mode, in_attrs, out_attrs); + CachedOp op(*attrs.subgraphs[0], + std::vector >()); + return op.BackwardStorageType(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); } static OpStatePtr CreateForeachState(const NodeAttrs& attrs, diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index 8004ba7efd73..d7b3de2866f2 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -118,72 +118,6 @@ bool InferSubgraphStorage(const nnvm::Symbol &subgraph, return g.GetAttr("storage_type_num_unknown_nodes") == 0; } -bool InferSubgraphBackwardStorage(const nnvm::Symbol &subgraph, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - using namespace nnvm; - // construct backward graph - nnvm::Graph grad_graph; - nnvm::Graph fwd_graph; - std::vector potential_nodes; - { - fwd_graph.outputs = subgraph.outputs; - std::vector ograd_entries; - ograd_entries.reserve(fwd_graph.outputs.size()); - for (size_t i = 0; i < fwd_graph.outputs.size(); ++i) { - ograd_entries.emplace_back(NodeEntry{Node::Create(), 0, 0}); - } - - std::vector xs; - std::vector args = subgraph.ListInputs(nnvm::Symbol::kReadOnlyArgs); - xs.reserve(args.size()); - for (const auto& i : args) - xs.emplace_back(NodeEntry{i, 0, 0}); - CHECK_GT(xs.size(), 0) - << "There are no inputs in computation graph that require gradients."; - - static const std::vector zero_ops{Op::Get("zeros_like"), Op::Get("_zeros")}; - grad_graph = pass::Gradient( - fwd_graph, fwd_graph.outputs, xs, ograd_entries, - exec::AggregateGradient, nullptr, nullptr, - zero_ops, "_copy"); - potential_nodes.reserve(fwd_graph.outputs.size() + xs.size() + ograd_entries.size()); - for (auto e : ograd_entries) - potential_nodes.push_back(e.node.get()); - for (auto e : xs) - potential_nodes.push_back(e.node.get()); - for (auto e : fwd_graph.outputs) - potential_nodes.push_back(e.node.get()); - } - - const auto& idx = grad_graph.indexed_graph(); - auto input_nodes = idx.input_nodes(); - StorageTypeVector storage_type_inputs(input_nodes.size()); - for (size_t i = 0; i < input_nodes.size(); i++) { - auto node_id = input_nodes[i]; - const nnvm::IndexedGraph::Node &n = idx[node_id]; - auto it = std::find(potential_nodes.begin(), potential_nodes.end(), n.source); - CHECK(it != potential_nodes.end()); - size_t idx = it - potential_nodes.begin(); - CHECK_LT(idx, in_attrs->size()); - storage_type_inputs[i] = in_attrs->at(idx); - } - CHECK_EQ(idx.outputs().size(), out_attrs->size()); - exec::DevMaskVector dev_masks(idx.num_nodes(), dev_mask); - imperative::CheckAndInferStorageType(&grad_graph, std::move(dev_masks), - std::move(storage_type_inputs), true); - - const auto& stypes = grad_graph.GetAttr("storage_type"); - DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, 
DispatchMode::kFComputeEx); - auto &outputs = idx.outputs(); - CHECK(outputs.size() == out_attrs->size()); - for (size_t i = 0; i < out_attrs->size(); i++) - STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, i, stypes[idx.entry_id(outputs[i])]); - return true; -} - LoopState::LoopState(const Symbol &g) { this->subgraph_sym = g; this->subgraph.outputs = g.outputs; diff --git a/src/operator/subgraph_op_common.h b/src/operator/subgraph_op_common.h index 448742ac908c..578ddabfce8b 100644 --- a/src/operator/subgraph_op_common.h +++ b/src/operator/subgraph_op_common.h @@ -47,16 +47,6 @@ bool InferSubgraphStorage(const nnvm::Symbol &subgraph, std::vector *in_attrs, std::vector *out_attrs); -/* - * Infer the storage types of inputs and outputs of the backward computation of - * an operator that contains a subgraph. - */ -bool InferSubgraphBackwardStorage(const nnvm::Symbol &subgraph, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs); - /* * This contains the states for running a loop and provides methods * of running the subgraph computation for an iteration. From 1f8469f8fb55c1f7f3655d286580f8c2b0ac7d10 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 20 Jun 2018 22:21:00 -0700 Subject: [PATCH 109/135] fix scala API. --- .../src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala index 9d51ddcb674a..0c80101e2506 100644 --- a/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala +++ b/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala @@ -33,7 +33,7 @@ private[mxnet] object CToScalaUtils { case "double" | "doubleorNone" => "Double" case "string" => "String" case "boolean" | "booleanorNone" => "Boolean" - case "tupleof" | "tupleof" | "ptr" | "" => "Any" + case "tupleof" | "tupleof" | "tupleof" | "ptr" | "" => "Any" case default => throw new IllegalArgumentException( s"Invalid type for args: $default, $argType") } From 25e15a048c31d45652b31cdccaa97ae3f164a57e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 20 Jun 2018 22:33:54 -0700 Subject: [PATCH 110/135] update comments. --- python/mxnet/symbol/contrib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 058bbb610177..54bda49a2a12 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -196,7 +196,7 @@ def check_data(inputs, in_type, msg): check_data(init_states, symbol.Symbol, "init_states should be a symbol or a list of symbols") not_state_list = isinstance(init_states, symbol.Symbol) - # TODO(zhengda) If the input python function references to the symbols outside + # If the input python function references to the symbols outside # the python function, we need to prune the computation graph constructed from # the function. One way of doing it is to mark the nodes in the computation graph # with AttrScope and prune the nodes without the special attribute. From ff4eea0b756e86b3b525825f37605afdc32e703d Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 21 Jun 2018 00:16:15 -0700 Subject: [PATCH 111/135] fix scala. 
--- .../src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala index 0c80101e2506..ca50a741012b 100644 --- a/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala +++ b/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala @@ -33,7 +33,7 @@ private[mxnet] object CToScalaUtils { case "double" | "doubleorNone" => "Double" case "string" => "String" case "boolean" | "booleanorNone" => "Boolean" - case "tupleof" | "tupleof" | "tupleof" | "ptr" | "" => "Any" + case "tupleof" | "tupleof" | "tupleof<>" | "ptr" | "" => "Any" case default => throw new IllegalArgumentException( s"Invalid type for args: $default, $argType") } From b8aa62a918176db121e2ed0e43c4aa74debf372d Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 21 Jun 2018 00:17:53 -0700 Subject: [PATCH 112/135] fix test. --- tests/python/unittest/test_operator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 02a97e160485..1b0d45e5c4f9 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -6058,7 +6058,8 @@ def verify_foreach(step, in_syms, state_syms, free_syms, all_ins.extend(frees) for i in range(len(all_ins)): assert_almost_equal(all_ins[i].grad.asnumpy(), - e.grad_arrays[gin_order[i]].asnumpy()) + e.grad_arrays[gin_order[i]].asnumpy(), + rtol=0.001, atol=0.0001) # Test cases: # * graph inputs are stored in different orders. From 64e4ff60ba5b62d3c2c38c0113977f6cc32cc566 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 21 Jun 2018 18:08:53 +0000 Subject: [PATCH 113/135] fix attribute name. --- python/mxnet/symbol/contrib.py | 2 +- src/c_api/c_api_symbolic.cc | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 54bda49a2a12..22a128439040 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -200,7 +200,7 @@ def check_data(inputs, in_type, msg): # the python function, we need to prune the computation graph constructed from # the function. One way of doing it is to mark the nodes in the computation graph # with AttrScope and prune the nodes without the special attribute. 
- with AttrScope(subgraph_name=name): + with AttrScope(__subgraph_name__=name): if isinstance(data, list): in_eles = [symbol.var(sym.name) for sym in data] else: diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 524723be1e93..c27a59a67c6e 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -38,11 +38,10 @@ void RegisterLegacyOpProp(); void RegisterLegacyNDFunc(); } const std::vector kHiddenKeys = { - "ctx_group", "lr_mult", "wd_mult", "force_mirroring", "mirror_stage", "subgraph_name" + "ctx_group", "lr_mult", "wd_mult", "force_mirroring", "mirror_stage" }; const std::vector kReplacedHiddenKeys = { - "__ctx_group__", "__lr_mult__", "__wd_mult__", "__force_mirroring__", "__mirror_stage__", - "subgraph_name" + "__ctx_group__", "__lr_mult__", "__wd_mult__", "__force_mirroring__", "__mirror_stage__" }; const char *kNamespaceSeparator = "$"; From d243c12069de5354807baf092f170abe2136ab5c Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 21 Jun 2018 18:11:47 +0000 Subject: [PATCH 114/135] move benchmark. --- .../rnn/benchmark_rnn.py => benchmark/python/control_flow/rnn.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename example/rnn/benchmark_rnn.py => benchmark/python/control_flow/rnn.py (100%) diff --git a/example/rnn/benchmark_rnn.py b/benchmark/python/control_flow/rnn.py similarity index 100% rename from example/rnn/benchmark_rnn.py rename to benchmark/python/control_flow/rnn.py From 3afc4d4757151f05bc7c072ead73f54a61547535 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 22 Jun 2018 21:52:46 +0000 Subject: [PATCH 115/135] fix the mapping of operator inputs/outputs and subgraph inputs/outputs. --- python/mxnet/symbol/contrib.py | 34 ++-- src/operator/control_flow.cc | 310 +++++++++++++++++++---------- src/operator/subgraph_op_common.cc | 43 ++++ src/operator/subgraph_op_common.h | 8 + 4 files changed, 276 insertions(+), 119 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 22a128439040..89735135acaf 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -254,32 +254,42 @@ def check_data(inputs, in_type, msg): cut_var_map = {sym.list_outputs()[0]:sym for sym in cut_syms} cut_var_names = cut_var_map.keys() - ordered_ins = [] - in_state_locs = [] + subg_input_names = g.list_inputs() + # ordered_ins contains input symbols in the following order: + # data_syms, state_syms, followed by cut_vars and vars in the closure. + ordered_ins = data_syms + ordered_ins.extend(init_states) + + # this defines the location of data_syms in the list of subgraph inputs in_data_locs = [] - for in_name in g.list_inputs(): + for name in data_names: + in_data_locs.append(subg_input_names.index(name)) + + # this defines the location of state_syms in the list of subgraph inputs. 
+ in_state_locs = [] + for name in state_names: + in_state_locs.append(subg_input_names.index(name)) + + remain_locs = [] + for in_name in subg_input_names: assert in_name in gin_names, "The input variable %s can't be found in graph inputs: %s" \ % (in_name, str(gin_names)) - if in_name in state_names: - ordered_ins.append(states_map[in_name]) - in_state_locs.append(len(ordered_ins) - 1) - elif in_name in data_names: - ordered_ins.append(data_map[in_name]) - in_data_locs.append(len(ordered_ins) - 1) - elif in_name in cut_var_names: + if in_name in cut_var_names: ordered_ins.append(cut_var_map[in_name]) - else: + remain_locs.append(subg_input_names.index(in_name)) + elif in_name not in data_names and in_name not in state_names: # The remaining inputs are the variable nodes created inside the UDF. # The subgraph can't have nodes shared with the main graph. As such, # we need to make a copy of these variable nodes. assert in_name in gin_names ordered_ins.append(copy.deepcopy(input_syms[in_name])) + remain_locs.append(subg_input_names.index(in_name)) num_outputs = len(flat_out) num_states = len(state_names) ret = symbol._internal._foreach(g, *ordered_ins, num_outputs=num_outputs, num_out_data=num_out_data, in_state_locs=in_state_locs, - in_data_locs=in_data_locs) + in_data_locs=in_data_locs, remain_locs=remain_locs) if num_outputs - num_states > 1: outs = [] for i in range(num_outputs - num_states): diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 4b11d77150be..5b8e4cb6f6e8 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -36,8 +36,12 @@ struct ForeachParam : public dmlc::Parameter { int num_args; int num_outputs; int num_out_data; + // The location of states in the subgraph inputs. nnvm::Tuple in_state_locs; + // The location of data arrays in the subgraph inputs. nnvm::Tuple in_data_locs; + // The location of remaining arrays in the subgraph inputs. + nnvm::Tuple remain_locs; DMLC_DECLARE_PARAMETER(ForeachParam) { DMLC_DECLARE_FIELD(num_args).set_lower_bound(1) .describe("Number of inputs."); @@ -49,6 +53,8 @@ struct ForeachParam : public dmlc::Parameter { .describe("The locations of loop states among the inputs."); DMLC_DECLARE_FIELD(in_data_locs) .describe("The locations of input data among the inputs."); + DMLC_DECLARE_FIELD(remain_locs) + .describe("The locations of remaining data among the inputs."); } }; // struct ForeachParam @@ -73,12 +79,9 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, const size_t iter_dim = 0; CHECK_EQ(outputs.size(), (size_t) params.num_outputs); CHECK_GT(params.in_data_locs.ndim(), 0); - size_t loc0 = params.in_data_locs[0]; - size_t len = inputs[loc0].shape()[iter_dim]; - for (size_t i = 1; i < params.in_data_locs.ndim(); i++) { - size_t loc = params.in_data_locs[i]; - CHECK_EQ(inputs[loc].shape()[iter_dim], len); - } + size_t len = inputs[0].shape()[iter_dim]; + for (size_t i = 1; i < params.in_data_locs.ndim(); i++) + CHECK_EQ(inputs[i].shape()[iter_dim], len); for (size_t i = 0; i < (size_t) params.num_out_data; i++) CHECK_EQ(len, outputs[i].shape()[iter_dim]); for (const auto &arr : outputs) @@ -112,12 +115,13 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, // Initialize the inputs for the subgraph. // In each iteration, we need to update the subgraph inputs for input data - // and the loop states. This initialization helps to get the read-only - // arrays in the loop. + // and the loop states. 
std::vector subg_inputs(inputs.size()); - for (size_t i = 0; i < inputs.size(); i++) { - // These are the initial states. - subg_inputs[i] = inputs[i]; + // The remaining arrays (other than input data and states) only need to be set once. + for (size_t j = 0; j < params.remain_locs.ndim(); j++) { + CHECK_LT(params.remain_locs[j], subg_inputs.size()); + subg_inputs[params.remain_locs[j]] = inputs[j + params.in_data_locs.ndim() + + params.in_state_locs.ndim()]; } // Here we iterate over the first dimension of the first input array. @@ -144,9 +148,9 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, // Get a slice from the input data arrays. for (size_t j = 0; j < params.in_data_locs.ndim(); j++) { size_t loc = params.in_data_locs[j]; - subg_inputs[loc] = inputs[loc].At(i); + subg_inputs[loc] = inputs[j].At(i); } - // For the rest of the iterations, the rest of the arguments are the outputs + // For the rest of the iterations, the states are the outputs // from the previous iteration. if (i > 0) { for (size_t j = params.num_out_data; j < subg_out_prev->size(); j++) { @@ -154,6 +158,11 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, CHECK_LT(params.in_state_locs[idx], subg_inputs.size()); subg_inputs[params.in_state_locs[idx]] = (*subg_out_prev)[j]; } + } else { + for (size_t j = 0; j < params.in_state_locs.ndim(); j++) { + CHECK_LT(params.in_state_locs[j], subg_inputs.size()); + subg_inputs[params.in_state_locs[j]] = inputs[j + params.in_data_locs.ndim()]; + } } state.Forward(i, subg_inputs, req, *subg_out_curr, ctx.need_grad); @@ -173,144 +182,178 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, CHECK_EQ(arr.storage_type(), kDefaultStorage) << "The for operator doesn't support the sparse format"; size_t iter_dim = 0; - std::unordered_set in_data_locs(params.in_data_locs.begin(), - params.in_data_locs.end()); - std::unordered_set in_state_locs(params.in_state_locs.begin(), - params.in_state_locs.end()); // The inputs contain out gradients, inputs and outputs. int len = inputs[0].shape()[iter_dim]; size_t num_output_data = params.num_out_data; // In backward computation, we need to run iterations from backwards. - std::vector ograds(params.num_outputs); - std::vector igrads(outputs.size()); - for (size_t i = num_output_data; i < ograds.size(); i++) - ograds[i] = inputs[i]; - std::vector iter_req(req.size()); + std::vector subg_ograds(params.num_outputs); + std::vector subg_igrads = outputs; + for (size_t i = num_output_data; i < subg_ograds.size(); i++) + subg_ograds[i] = inputs[i]; + std::vector subg_req; for (auto r : req) CHECK_NE(r, kWriteInplace); + + // There are three types of arrays in igrads. + // * data gradients. + // * loop variable gradients. + // * remaining variable gradients. + // They are in the following order: + // [data vars], [loop vars], [remaining vars] + + // [remaining vars] + for (size_t i = 0; i < params.remain_locs.ndim(); i++) { + size_t loc = params.remain_locs[i]; + subg_igrads[loc] = outputs[i + params.in_data_locs.ndim() + params.in_state_locs.ndim()]; + } + for (int iter_num = len - 1; iter_num >= 0; iter_num--) { for (int i = 0; i < params.num_out_data; i++) - ograds[i] = inputs[i].At(iter_num); - - // There are three types of arrays in igrads. - // * data gradients. - // * loop variable gradients. - // * read-only variable gradients. - // These are the input data gradients. - for (size_t i = 0; i < igrads.size(); i++) { - // data gradients. 
- if (in_data_locs.count(i)) { - igrads[i] = outputs[i].At(iter_num); - iter_req[i] = req[i]; - continue; - } + subg_ograds[i] = inputs[i].At(iter_num); + if (iter_num == len - 1) { + subg_req = req; + } else { + subg_req.clear(); + subg_req.resize(req.size(), kAddTo); + } - bool in_state = in_state_locs.count(i); - if (iter_num != 0 && in_state) { + // [data vars] + for (size_t i = 0; i < params.in_data_locs.ndim(); i++) { + size_t loc = params.in_data_locs[i]; + subg_igrads[loc] = outputs[i].At(iter_num); + subg_req[loc] = req[i]; + } + // [loop vars] + for (size_t i = 0; i < params.in_state_locs.ndim(); i++) { + size_t loc = params.in_state_locs[i]; + const NDArray &output = outputs[i + params.in_data_locs.ndim()]; + if (iter_num != 0) { // For state gradients, we need to allocate new NDArrays // because intermediate state gradients won't be returned to the users. - igrads[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), - true, outputs[i].dtype()); + subg_igrads[loc] = NDArray(output.shape(), output.ctx(), true, output.dtype()); } else { - igrads[i] = outputs[i]; + subg_igrads[loc] = output; } - if (in_state) - // For the first iteration, we need to use the request provided by - // the user to write state gradients to the outputs. - iter_req[i] = iter_num != 0 ? kWriteTo : req[i]; - else - // For all read-only variable gradients, we need to use the request - // provided by the user in the last iteration and later on add gradients - // to the output arrays. - iter_req[i] = iter_num == len - 1 ? req[i]: kAddTo; + // For the first iteration, we need to use the request provided by + // the user to write state gradients to the outputs. + subg_req[loc] = iter_num != 0 ? kWriteTo : req[i + params.in_data_locs.ndim()]; } - state.Backward(iter_num, ograds, iter_req, igrads); + state.Backward(iter_num, subg_ograds, subg_req, subg_igrads); - size_t num_states = ograds.size() - num_output_data; + size_t num_states = subg_ograds.size() - num_output_data; for (size_t i = 0; i < num_states; i++) { size_t loc = params.in_state_locs[i]; - CHECK_LT(loc, igrads.size()); - ograds[i + num_output_data] = igrads[loc]; + CHECK_LT(loc, subg_igrads.size()); + subg_ograds[i + num_output_data] = subg_igrads[loc]; } } state.Cleanup(); } +template +static void remap(const std::vector &op_in, size_t start, + const nnvm::Tuple &locs, std::vector *subg_in) { + auto op_in_it = op_in.begin() + start; + for (size_t i = 0; i < locs.ndim(); i++) { + dim_t loc = locs[i]; + subg_in->at(loc) = *(op_in_it + i); + } +} + static bool ForeachShape(const nnvm::NodeAttrs& attrs, std::vector *in_shape, std::vector *out_shape) { const ForeachParam& params = nnvm::get(attrs.parsed); CHECK_EQ(out_shape->size(), (size_t) params.num_outputs); CHECK_EQ(attrs.subgraphs.size(), 1U); - nnvm::Graph g; - g.outputs = attrs.subgraphs[0]->outputs; - const auto& idx = g.indexed_graph(); - CHECK_EQ(idx.input_nodes().size(), in_shape->size()); - CHECK_EQ(idx.outputs().size(), out_shape->size()); - - // Put the input and output shapes to the shape vector. 
- nnvm::ShapeVector shapes(idx.num_node_entries()); - const auto &input_nids = idx.input_nodes(); - CHECK_EQ(input_nids.size(), in_shape->size()); - for (size_t i = 0; i < in_shape->size(); i++) { - auto eid = idx.entry_id(input_nids[i], 0); - shapes[eid] = in_shape->at(i); - } - CHECK_EQ(g.outputs.size(), out_shape->size()); - for (size_t i = 0; i < out_shape->size(); i++) { - auto eid = idx.entry_id(g.outputs[i]); - shapes[eid] = out_shape->at(i); - } - // foreach iterates over the first input NDArray over the first dimension. - size_t loc0 = params.in_data_locs[0]; - size_t len = in_shape->at(loc0)[0]; + + std::vector subg_in_shape(in_shape->size()); + // data shape for (size_t i = 0; i < params.in_data_locs.ndim(); i++) { size_t loc = params.in_data_locs[i]; - auto eid = idx.entry_id(input_nids[loc], 0); - CHECK_EQ(len, in_shape->at(loc)[0]); - shapes[eid] = TShape(in_shape->at(loc).begin() + 1, in_shape->at(loc).end()); + subg_in_shape[loc] = TShape(in_shape->at(i).begin() + 1, in_shape->at(i).end()); } - - // Infer shape of the graph. - g.attrs["shape"] = std::make_shared(std::move(shapes)); - g = exec::InferShape(std::move(g)); - - const auto& shapes1 = g.GetAttr("shape"); - // Inferring the shape in the subgraph may infer the shape of the inputs. - // We need to copy the inferred input shapes back. - CHECK_EQ(input_nids.size(), in_shape->size()); - for (size_t i = 0; i < in_shape->size(); i++) { - auto eid = idx.entry_id(input_nids[i], 0); - // If the input shape is none, we should update them. - if ((*in_shape)[i].ndim() == 0 || (*in_shape)[i].Size() == 0) - SHAPE_ASSIGN_CHECK(*in_shape, i, shapes1[eid]); + // state shape + remap(*in_shape, params.in_data_locs.ndim(), params.in_state_locs, + &subg_in_shape); + // remaining shape + remap(*in_shape, params.in_data_locs.ndim() + params.in_state_locs.ndim(), + params.remain_locs, &subg_in_shape); + + std::vector subg_out_shape = *out_shape; + for (int i = 0; i < params.num_out_data; i++) { + TShape shape = subg_out_shape[i]; + // If we don't have shape info, we don't need to do anything. + if (shape.ndim() == 0) + continue; + subg_out_shape[i] = TShape(shape.begin() + 1, shape.end()); } + bool infer_success = InferSubgraphShape(*attrs.subgraphs[0], + &subg_in_shape, &subg_out_shape); + + // After inference, we need to move inferred information back to in_shape and + // out_shape. + // For the shape of output data. + size_t len = in_shape->at(0)[0]; + CHECK_GT(len, 0); for (int i = 0; i < params.num_out_data; i++) { - uint32_t eid = idx.entry_id(g.outputs[i]); - const auto& g_out_shape = shapes1[eid]; + // If the output shape isn't inferred, we don't need to propogate the info. + const auto& g_out_shape = subg_out_shape[i]; + if (g_out_shape.ndim() == 0) + continue; + auto out = TShape(g_out_shape.ndim() + 1); out[0] = len; for (size_t i = 1; i < out.ndim(); i++) out[i] = g_out_shape[i - 1]; SHAPE_ASSIGN_CHECK(*out_shape, i, out); } + // For the shape of output states. + for (size_t i = params.num_out_data; i < subg_out_shape.size(); i++) + SHAPE_ASSIGN_CHECK(*out_shape, i, subg_out_shape[i]); - // For the remaining shapes. - for (size_t i = params.num_out_data; i < g.outputs.size(); i++) { - uint32_t eid = idx.entry_id(g.outputs[i]); - SHAPE_ASSIGN_CHECK(*out_shape, i, shapes1[eid]); + // For the shape of input data. 
+ for (size_t i = 0; i < params.in_data_locs.ndim(); i++) { + size_t loc = params.in_data_locs[i]; + const auto &shape = subg_in_shape[loc]; + // If the input data shape isn't inferred, we don't need to propogate the + // info. + if (shape.ndim() == 0) + continue; + + auto in = TShape(shape.ndim() + 1); + in[0] = len; + for (size_t i = 1; i < in.ndim(); i++) + in[i] = shape[i - 1]; + SHAPE_ASSIGN_CHECK(*in_shape, i, in); } - size_t num_states = g.outputs.size() - params.num_out_data; - for (size_t i = 0; i < num_states; i++) { + // For the shape of state. + for (size_t i = 0; i < params.in_state_locs.ndim(); i++) { size_t loc = params.in_state_locs[i]; - CHECK((*out_shape)[i + params.num_out_data] == (*in_shape)[loc]); + SHAPE_ASSIGN_CHECK(*in_shape, i + params.in_data_locs.ndim(), + subg_in_shape[loc]); + } + // For the shape of remaining data. + for (size_t i = 0; i < params.remain_locs.ndim(); i++) { + size_t loc = params.remain_locs[i]; + SHAPE_ASSIGN_CHECK(*in_shape, + i + params.in_data_locs.ndim() + params.in_state_locs.ndim(), + subg_in_shape[loc]); + } + + if (infer_success) { + size_t num_states = out_shape->size() - params.num_out_data; + for (size_t i = 0; i < num_states; i++) { + CHECK_EQ((*out_shape)[i + params.num_out_data], + (*in_shape)[i + params.in_data_locs.ndim()]); + } } // Check if we have inferred the shapes correctly. - return g.GetAttr("shape_num_unknown_nodes") == 0; + return infer_success; } static bool ForeachType(const nnvm::NodeAttrs& attrs, @@ -318,7 +361,26 @@ static bool ForeachType(const nnvm::NodeAttrs& attrs, const ForeachParam& params = nnvm::get(attrs.parsed); CHECK_EQ(out_type->size(), (size_t) params.num_outputs); CHECK_EQ(attrs.subgraphs.size(), 1U); - return InferSubgraphDataType(*attrs.subgraphs[0], in_type, out_type); + std::vector subg_in_type(in_type->size(), 0); + remap(*in_type, 0, params.in_data_locs, &subg_in_type); + remap(*in_type, params.in_data_locs.ndim(), params.in_state_locs, &subg_in_type); + remap(*in_type, params.in_data_locs.ndim() + params.in_state_locs.ndim(), + params.remain_locs, &subg_in_type); + bool success = InferSubgraphDataType(*attrs.subgraphs[0], &subg_in_type, out_type); + for (size_t i = 0; i < params.in_data_locs.ndim(); i++) { + size_t loc = params.in_data_locs[i]; + TYPE_ASSIGN_CHECK(*in_type, i, subg_in_type[loc]); + } + for (size_t i = 0; i < params.in_state_locs.ndim(); i++) { + size_t loc = params.in_state_locs[i]; + TYPE_ASSIGN_CHECK(*in_type, i + params.in_data_locs.ndim(), subg_in_type[loc]); + } + for (size_t i = 0; i < params.remain_locs.ndim(); i++) { + size_t loc = params.remain_locs[i]; + TYPE_ASSIGN_CHECK(*in_type, i + params.in_data_locs.ndim() + params.in_state_locs.ndim(), + subg_in_type[loc]); + } + return success; } static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, @@ -329,8 +391,29 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, const ForeachParam& params = nnvm::get(attrs.parsed); CHECK_EQ(out_attrs->size(), (size_t) params.num_outputs); CHECK_EQ(attrs.subgraphs.size(), 1U); - return InferSubgraphStorage(*attrs.subgraphs[0], dev_mask, - dispatch_mode, in_attrs, out_attrs); + std::vector subg_in_attrs(in_attrs->size(), kUndefinedStorage); + remap(*in_attrs, 0, params.in_data_locs, &subg_in_attrs); + remap(*in_attrs, params.in_data_locs.ndim(), params.in_state_locs, &subg_in_attrs); + remap(*in_attrs, params.in_data_locs.ndim() + params.in_state_locs.ndim(), + params.remain_locs, &subg_in_attrs); + bool success = InferSubgraphStorage(*attrs.subgraphs[0], dev_mask, + 
dispatch_mode, &subg_in_attrs, out_attrs); + for (size_t i = 0; i < params.in_data_locs.ndim(); i++) { + size_t loc = params.in_data_locs[i]; + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, i, subg_in_attrs[loc]); + } + for (size_t i = 0; i < params.in_state_locs.ndim(); i++) { + size_t loc = params.in_state_locs[i]; + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, i + params.in_data_locs.ndim(), + subg_in_attrs[loc]); + } + for (size_t i = 0; i < params.remain_locs.ndim(); i++) { + size_t loc = params.remain_locs[i]; + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, + i + params.in_data_locs.ndim() + params.in_state_locs.ndim(), + subg_in_attrs[loc]); + } + return success; } static bool BackwardForeachStorageType(const nnvm::NodeAttrs& attrs, @@ -340,11 +423,24 @@ static bool BackwardForeachStorageType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { const ForeachParam& params = nnvm::get(attrs.parsed); CHECK_EQ(out_attrs->size(), (size_t) params.num_args - 1); + CHECK_EQ(in_attrs->size(), (size_t) params.num_args - 1 + params.num_outputs * 2); CHECK_EQ(attrs.subgraphs.size(), 1U); CachedOp op(*attrs.subgraphs[0], std::vector >()); + // map the operator inputs to the subgraph inputs. + std::vector subg_forward_ins(params.num_args - 1, kUndefinedStorage); + remap(*in_attrs, params.num_outputs, params.in_data_locs, &subg_forward_ins); + remap(*in_attrs, params.num_outputs + params.in_data_locs.ndim(), + params.in_state_locs, &subg_forward_ins); + remap(*in_attrs, params.num_outputs + params.in_data_locs.ndim() + params.in_state_locs.ndim(), + params.remain_locs, &subg_forward_ins); + + // Copy backward input storage to backward subgraph input storage. + std::vector subg_in_attrs = *in_attrs; + for (size_t i = 0; i < subg_forward_ins.size(); i++) + subg_in_attrs[i + params.num_outputs] = subg_forward_ins[i]; return op.BackwardStorageType(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); + &subg_in_attrs, out_attrs); } static OpStatePtr CreateForeachState(const NodeAttrs& attrs, diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index d7b3de2866f2..55da55cd28d7 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -118,6 +118,49 @@ bool InferSubgraphStorage(const nnvm::Symbol &subgraph, return g.GetAttr("storage_type_num_unknown_nodes") == 0; } +bool InferSubgraphShape(const nnvm::Symbol &subgraph, + std::vector *in_shape, + std::vector *out_shape) { + nnvm::Graph g; + g.outputs = subgraph.outputs; + const auto& idx = g.indexed_graph(); + CHECK_EQ(idx.input_nodes().size(), in_shape->size()); + CHECK_EQ(idx.outputs().size(), out_shape->size()); + + // Put the input and output shapes to the shape vector. + nnvm::ShapeVector shapes(idx.num_node_entries()); + const auto &input_nids = idx.input_nodes(); + CHECK_EQ(input_nids.size(), in_shape->size()); + for (size_t i = 0; i < in_shape->size(); i++) { + auto eid = idx.entry_id(input_nids[i], 0); + shapes[eid] = in_shape->at(i); + } + CHECK_EQ(g.outputs.size(), out_shape->size()); + for (size_t i = 0; i < out_shape->size(); i++) { + auto eid = idx.entry_id(g.outputs[i]); + shapes[eid] = out_shape->at(i); + } + + // Infer shape of the graph. + g.attrs["shape"] = std::make_shared(std::move(shapes)); + g = exec::InferShape(std::move(g)); + + const auto& shapes1 = g.GetAttr("shape"); + // Inferring the shape in the subgraph may infer the shape of the inputs. + // We need to copy the inferred input shapes back. 
+ CHECK_EQ(input_nids.size(), in_shape->size()); + for (size_t i = 0; i < in_shape->size(); i++) { + auto eid = idx.entry_id(input_nids[i], 0); + SHAPE_ASSIGN_CHECK(*in_shape, i, shapes1[eid]); + } + + for (size_t i = 0; i < g.outputs.size(); i++) { + uint32_t eid = idx.entry_id(g.outputs[i]); + SHAPE_ASSIGN_CHECK(*out_shape, i, shapes1[eid]); + } + return g.GetAttr("shape_num_unknown_nodes") == 0; +} + LoopState::LoopState(const Symbol &g) { this->subgraph_sym = g; this->subgraph.outputs = g.outputs; diff --git a/src/operator/subgraph_op_common.h b/src/operator/subgraph_op_common.h index 578ddabfce8b..a2706bebc0d3 100644 --- a/src/operator/subgraph_op_common.h +++ b/src/operator/subgraph_op_common.h @@ -37,6 +37,14 @@ namespace op { bool InferSubgraphDataType(const nnvm::Symbol &subgraph, std::vector *in_type, std::vector *out_type); +/* + * Infer the shape of inputs and outputs of an operator that contains a + * subgraph. + */ +bool InferSubgraphShape(const nnvm::Symbol &subgraph, + std::vector *in_shape, + std::vector *out_shape); + /* * Infer the storage types of inputs and outputs of an operator that contains a * subgraph. From 62901fe65650bb1e3e042917a9d94403d579a22a Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 23 Jun 2018 02:43:21 +0000 Subject: [PATCH 116/135] add tests for dtype/shape inference. --- tests/python/unittest/test_operator.py | 82 ++++++++++++++++++++++---- 1 file changed, 70 insertions(+), 12 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 1b0d45e5c4f9..711e226d16cc 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5944,6 +5944,7 @@ def test_foreach(): v5 = mx.sym.var("v2") v6 = mx.sym.var("v3") v7 = mx.sym.var("v4") + v8 = mx.sym.var("v5") # This tests foreach with accumulation sum. 
def step1(in1, states, free): @@ -5953,8 +5954,31 @@ def step2(in1, states, free): out = states[0] + in1 * 2 + free[0] return (out, [out]) def step3(in1, states, free): - out = in1[0] + in1[1] + states[0] + states[1] + free[0] + out = in1[0] + in1[1] * 2 + states[0] + states[1] * 2 + free[0] return ([out, out * 2], [out * 2, out * 3]) + def step4(in1, states, free): + out = in1[1] * 2 + states[0] + free[0] + states[1] * 2 + in1[0] + return ([out, out * 2], [out * 2, out * 3]) + def step5(in1, states, free): + if isinstance(in1[0], mx.nd.NDArray): + out1 = mx.nd.broadcast_add(states[0] + free[1], in1[1] * 2) + out2 = mx.nd.broadcast_add(in1[0], free[0] + states[1] * 2) + else: + out1 = mx.sym.broadcast_add(states[0] + free[1], in1[1] * 2) + out2 = mx.sym.broadcast_add(in1[0], free[0] + states[1] * 2) + return ([out1, out2 * 2], [states[0] * 2, states[1] * 3]) + def step6(in1, states, free): + if isinstance(in1[0], mx.nd.NDArray): + out1 = mx.nd.broadcast_add(states[0] + mx.nd.cast(free[1], 'float32'), + mx.nd.cast(in1[1], 'float32') * 2) + out2 = mx.nd.broadcast_add(in1[0], + free[0] + mx.nd.cast(states[1], 'float32') * 2) + else: + out1 = mx.sym.broadcast_add(states[0] + mx.sym.cast(free[1], 'float32'), + mx.sym.cast(in1[1], 'float32') * 2) + out2 = mx.sym.broadcast_add(in1[0], + free[0] + mx.sym.cast(states[1], 'float32') * 2) + return ([out1, out2 * 2], [states[0] * 2, states[1] * 3]) def verify_foreach(step, in_syms, state_syms, free_syms, in_arrs, init_states, frees, out_grads, is_train=True, @@ -5962,7 +5986,8 @@ def verify_foreach(step, in_syms, state_syms, free_syms, step_sym = lambda in_syms, state_syms : step(in_syms, state_syms, free_syms) res, states = mx.sym.contrib.foreach(step_sym, in_syms, state_syms) out = _as_list(res) - for i in range(len(out)): + num_outputs = len(out) + for i in range(num_outputs): out[i] = out[i] * 2 out.extend(states) out = mx.sym.Group(out) @@ -6041,12 +6066,13 @@ def verify_foreach(step, in_syms, state_syms, free_syms, outs.append(states) states = mx.nd.expand_dims(states, 0) res2.append(states) - res = mx.nd.concat(*res2, dim=0) + if is_train: + res = mx.nd.concat(*res2, dim=0) tmp_grads = out_grads[0][:] tmp_grads1 = [mx.nd.expand_dims(grad, 0) for grad in out_grads[1]] tmp_grads.extend(tmp_grads1) - if (is_train): + if is_train: res.backward(mx.nd.concat(*tmp_grads, dim=0)) for i in range(len(outs)): assert e.outputs[i].shape == outs[i].shape @@ -6121,6 +6147,36 @@ def verify_foreach(step, in_syms, state_syms, free_syms, verify_foreach(step3, [v3, v4], [v5, v6], [v7], arrs, states, frees, out_grads) verify_foreach(step3, [v3, v4], [v5, v6], [v7], arrs, states, frees, out_grads, False) + # Test multiple inputs and outputs. + # The order of subgraph inputs doesn't match the operator inputs + arrs = [mx.nd.random.uniform(shape=(3, 2)), mx.nd.random.uniform(shape=(3, 2))] + states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] + out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape), mx.nd.random.uniform(-10, 10, arrs[1].shape)], + [mx.nd.random.uniform(-10, 10, states[0].shape), mx.nd.random.uniform(-10, 10, states[1].shape)]] + verify_foreach(step4, [v3, v4], [v5, v6], [v7], arrs, states, frees, out_grads) + verify_foreach(step4, [v3, v4], [v5, v6], [v7], arrs, states, frees, out_grads, False) + + # Test multiple inputs and outputs. + # The data inputs and states have different shapes. 
+ frees = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2, 2))] + arrs = [mx.nd.random.uniform(shape=(3, 2, 2)), mx.nd.random.uniform(shape=(3, 2))] + states = [mx.nd.random.uniform(shape=(2, 2)), mx.nd.random.uniform(shape=(2))] + out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape), mx.nd.random.uniform(-10, 10, arrs[0].shape)], + [mx.nd.random.uniform(-10, 10, states[0].shape), mx.nd.random.uniform(-10, 10, states[1].shape)]] + verify_foreach(step5, [v3, v4], [v5, v6], [v7, v8], arrs, states, frees, out_grads, False) + + # Test multiple inputs and outputs. + # The data inputs and states have different shapes and data types. + frees = [mx.nd.random.uniform(shape=(2)), + mx.nd.cast(mx.nd.random.uniform(shape=(2, 2)), 'float64')] + arrs = [mx.nd.random.uniform(shape=(3, 2, 2)), + mx.nd.cast(mx.nd.random.uniform(shape=(3, 2)), dtype='float16')] + states = [mx.nd.random.uniform(shape=(2, 2)), + mx.nd.cast(mx.nd.random.uniform(shape=(2)), dtype='int32')] + out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape), mx.nd.random.uniform(-10, 10, arrs[0].shape)], + [mx.nd.random.uniform(-10, 10, states[0].shape), mx.nd.random.uniform(-10, 10, states[1].shape)]] + verify_foreach(step6, [v3, v4], [v5, v6], [v7, v8], arrs, states, frees, out_grads, False) + @with_seed() def test_foreach_nested(): @@ -6218,8 +6274,8 @@ def sym_group(out): 'i2h_weight': i2h_warr, 'h2h_weight': h2h_warr, 'i2h_bias': i2h_barr, 'h2h_bias': h2h_barr} args2 = {'data': data_arr, 'h': h_arr, 'c': c_arr, - 'mylstm_i2h_weight': i2h_warr, 'mylstm_h2h_weight': h2h_warr, - 'mylstm_i2h_bias': i2h_barr, 'mylstm_h2h_bias': h2h_barr} + 'i2h_weight': i2h_warr, 'h2h_weight': h2h_warr, + 'i2h_bias': i2h_barr, 'h2h_bias': h2h_barr} # gradients for the backward of the foreach symbol data_arr_grad1 = mx.nd.empty(data_arr.shape) @@ -6242,8 +6298,8 @@ def sym_group(out): i2h_barr_grad2 = mx.nd.empty(i2h_barr.shape) h2h_barr_grad2 = mx.nd.empty(h2h_barr.shape) args_grad2 = {'data': data_arr_grad2, 'h': h_arr_grad2, 'c': c_arr_grad2, - 'mylstm_i2h_weight': i2h_warr_grad2, 'mylstm_h2h_weight': h2h_warr_grad2, - 'mylstm_i2h_bias': i2h_barr_grad2, 'mylstm_h2h_bias': h2h_barr_grad2} + 'i2h_weight': i2h_warr_grad2, 'h2h_weight': h2h_warr_grad2, + 'i2h_bias': i2h_barr_grad2, 'h2h_bias': h2h_barr_grad2} # Symbol of running LSTM with foreach. out = mx.sym.contrib.foreach(step, data, [init_h, init_c]) @@ -6255,7 +6311,7 @@ def sym_group(out): e1 = out.bind(ctx=default_context(), args=args1, args_grad=args_grad1) # Symbol of running unrolled LSTM. 
- lstm = mx.rnn.LSTMCell(4, prefix='mylstm_') + lstm = mx.rnn.LSTMCell(4, prefix='') h = init_h c = init_c unroll_outs = [] @@ -6290,16 +6346,18 @@ def sym_group(out): e1.backward(out_grads) e2.forward(is_train=True, data = data_arr, h = h_arr, c = c_arr, - mylstm_i2h_weight = i2h_warr, mylstm_h2h_weight = h2h_warr, - mylstm_i2h_bias = i2h_barr, mylstm_h2h_bias = h2h_barr) + i2h_weight = i2h_warr, h2h_weight = h2h_warr, + i2h_bias = i2h_barr, h2h_bias = h2h_barr) outputs2 = e2.outputs e2.backward(out_grads) for i in range(len(outputs2)): assert_almost_equal(outputs1[i].asnumpy(), outputs2[i].asnumpy(), rtol=0.001, atol=0.0001) + input_names = out.list_inputs() for i in range(len(e1.grad_arrays)): - assert_almost_equal(e1.grad_arrays[i].asnumpy(), e2.grad_arrays[i].asnumpy()) + name = input_names[i] + assert_almost_equal(args_grad1[name].asnumpy(), args_grad2[name].asnumpy()) @with_seed() From 14b8fb9058d943aebc5eda6d77e7c1dca60fc5c9 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 23 Jun 2018 07:20:07 +0000 Subject: [PATCH 117/135] reorganize tests. --- tests/python/unittest/test_operator.py | 81 ++++++++++++-------------- 1 file changed, 38 insertions(+), 43 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 711e226d16cc..21c6b476641a 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5946,40 +5946,6 @@ def test_foreach(): v7 = mx.sym.var("v4") v8 = mx.sym.var("v5") - # This tests foreach with accumulation sum. - def step1(in1, states, free): - out = in1 * 2 + states[0] + free[0] - return (out, [out]) - def step2(in1, states, free): - out = states[0] + in1 * 2 + free[0] - return (out, [out]) - def step3(in1, states, free): - out = in1[0] + in1[1] * 2 + states[0] + states[1] * 2 + free[0] - return ([out, out * 2], [out * 2, out * 3]) - def step4(in1, states, free): - out = in1[1] * 2 + states[0] + free[0] + states[1] * 2 + in1[0] - return ([out, out * 2], [out * 2, out * 3]) - def step5(in1, states, free): - if isinstance(in1[0], mx.nd.NDArray): - out1 = mx.nd.broadcast_add(states[0] + free[1], in1[1] * 2) - out2 = mx.nd.broadcast_add(in1[0], free[0] + states[1] * 2) - else: - out1 = mx.sym.broadcast_add(states[0] + free[1], in1[1] * 2) - out2 = mx.sym.broadcast_add(in1[0], free[0] + states[1] * 2) - return ([out1, out2 * 2], [states[0] * 2, states[1] * 3]) - def step6(in1, states, free): - if isinstance(in1[0], mx.nd.NDArray): - out1 = mx.nd.broadcast_add(states[0] + mx.nd.cast(free[1], 'float32'), - mx.nd.cast(in1[1], 'float32') * 2) - out2 = mx.nd.broadcast_add(in1[0], - free[0] + mx.nd.cast(states[1], 'float32') * 2) - else: - out1 = mx.sym.broadcast_add(states[0] + mx.sym.cast(free[1], 'float32'), - mx.sym.cast(in1[1], 'float32') * 2) - out2 = mx.sym.broadcast_add(in1[0], - free[0] + mx.sym.cast(states[1], 'float32') * 2) - return ([out1, out2 * 2], [states[0] * 2, states[1] * 3]) - def verify_foreach(step, in_syms, state_syms, free_syms, in_arrs, init_states, frees, out_grads, is_train=True, free_vars_func=None, num_iters=1): @@ -6094,15 +6060,12 @@ def verify_foreach(step, in_syms, state_syms, free_syms, # * the number of iterations: odd or even. # * multiple inputs and multiple outputs. # * inference. 
- - #states = [mx.nd.random.uniform(shape=(2))] - - #frees1 = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] - #arrs = mx.nd.random.uniform(shape=(3, 2)) - states = [mx.nd.arange(2)] - + def step1(in1, states, free): + out = in1 * 2 + states[0] + free[0] + return (out, [out]) frees1 = [mx.nd.arange(2), mx.nd.arange(2) + 1] arrs = mx.nd.arange(6).reshape(shape=(3, 2)) + states = [mx.nd.arange(2)] out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], [mx.nd.random.uniform(-10, 10, states[0].shape)]] verify_foreach(step1, v3, [v4], [v5 + v6], arrs, states, frees1, out_grads, True, @@ -6114,25 +6077,31 @@ def verify_foreach(step, in_syms, state_syms, free_syms, verify_foreach(step1, v3, [v4], [v5 + v6], arrs, states, frees1, out_grads, False, lambda frees : [frees[0] + frees[1]], 5) + # Test the even number of iterations. frees = [mx.nd.random.uniform(shape=(2))] arrs = mx.nd.random.uniform(shape=(2, 2)) out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], [mx.nd.random.uniform(-10, 10, states[0].shape)]] verify_foreach(step1, v3, [v4], [v5], arrs, states, frees, out_grads) verify_foreach(step1, v3, [v4], [v5], arrs, states, frees, out_grads, False) - + # Test the odd number of iterations arrs = mx.nd.random.uniform(shape=(3, 2)) out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], [mx.nd.random.uniform(-10, 10, states[0].shape)]] verify_foreach(step1, v3, [v4], [v5], arrs, states, frees, out_grads) verify_foreach(step1, v3, [v4], [v5], arrs, states, frees, out_grads, False) + # Reorder the input and state in the subgraph inputs. + def step2(in1, states, free): + out = states[0] + in1 * 2 + free[0] + return (out, [out]) + # Test the even number of iterations. arrs = mx.nd.random.uniform(shape=(2, 2)) out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], [mx.nd.random.uniform(-10, 10, states[0].shape)]] verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads) verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads, False) - + # Test the odd number of iterations. arrs = mx.nd.random.uniform(shape=(3, 2)) out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], [mx.nd.random.uniform(-10, 10, states[0].shape)]] @@ -6140,6 +6109,9 @@ def verify_foreach(step, in_syms, state_syms, free_syms, verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads, False) # Test multiple inputs and outputs. + def step3(in1, states, free): + out = in1[0] + in1[1] * 2 + states[0] + states[1] * 2 + free[0] + return ([out, out], [out * 2, out * 3]) arrs = [mx.nd.random.uniform(shape=(3, 2)), mx.nd.random.uniform(shape=(3, 2))] states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape), mx.nd.random.uniform(-10, 10, arrs[1].shape)], @@ -6149,6 +6121,9 @@ def verify_foreach(step, in_syms, state_syms, free_syms, # Test multiple inputs and outputs. # The order of subgraph inputs doesn't match the operator inputs + def step4(in1, states, free): + out = in1[1] * 2 + states[0] + free[0] + states[1] * 2 + in1[0] + return ([out, out * 2], [out * 2, out * 3]) arrs = [mx.nd.random.uniform(shape=(3, 2)), mx.nd.random.uniform(shape=(3, 2))] states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape), mx.nd.random.uniform(-10, 10, arrs[1].shape)], @@ -6158,6 +6133,14 @@ def verify_foreach(step, in_syms, state_syms, free_syms, # Test multiple inputs and outputs. 
# The data inputs and states have different shapes. + def step5(in1, states, free): + if isinstance(in1[0], mx.nd.NDArray): + out1 = mx.nd.broadcast_add(states[0] + free[1], in1[1] * 2) + out2 = mx.nd.broadcast_add(in1[0], free[0] + states[1] * 2) + else: + out1 = mx.sym.broadcast_add(states[0] + free[1], in1[1] * 2) + out2 = mx.sym.broadcast_add(in1[0], free[0] + states[1] * 2) + return ([out1, out2 * 2], [states[0] * 2, states[1] * 3]) frees = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2, 2))] arrs = [mx.nd.random.uniform(shape=(3, 2, 2)), mx.nd.random.uniform(shape=(3, 2))] states = [mx.nd.random.uniform(shape=(2, 2)), mx.nd.random.uniform(shape=(2))] @@ -6167,6 +6150,18 @@ def verify_foreach(step, in_syms, state_syms, free_syms, # Test multiple inputs and outputs. # The data inputs and states have different shapes and data types. + def step6(in1, states, free): + if isinstance(in1[0], mx.nd.NDArray): + out1 = mx.nd.broadcast_add(states[0] + mx.nd.cast(free[1], 'float32'), + mx.nd.cast(in1[1], 'float32') * 2) + out2 = mx.nd.broadcast_add(in1[0], + free[0] + mx.nd.cast(states[1], 'float32') * 2) + else: + out1 = mx.sym.broadcast_add(states[0] + mx.sym.cast(free[1], 'float32'), + mx.sym.cast(in1[1], 'float32') * 2) + out2 = mx.sym.broadcast_add(in1[0], + free[0] + mx.sym.cast(states[1], 'float32') * 2) + return ([out1, out2 * 2], [states[0] * 2, states[1] * 3]) frees = [mx.nd.random.uniform(shape=(2)), mx.nd.cast(mx.nd.random.uniform(shape=(2, 2)), 'float64')] arrs = [mx.nd.random.uniform(shape=(3, 2, 2)), From f7d7f178a34edbc674b7c12fabb62c094a7be294 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 23 Jun 2018 18:49:34 +0000 Subject: [PATCH 118/135] fix a bug of cutting NodeEntry. When two node entries refer to the same output of a node, we should create only one var node for these two node entries. --- src/nnvm/graph_editor.cc | 30 +++++++++++++++++++++++--- tests/python/unittest/test_operator.py | 13 +++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/src/nnvm/graph_editor.cc b/src/nnvm/graph_editor.cc index c23fc2569697..3167dfb51405 100644 --- a/src/nnvm/graph_editor.cc +++ b/src/nnvm/graph_editor.cc @@ -65,6 +65,16 @@ std::vector GetInputSymbols(const nnvm::Symbol &sym) { */ bool CutGraphInputs(const std::vector &input_entries, bool skip_var, std::vector *orig_entries) { + struct pred_entry { + nnvm::NodeEntry e; + pred_entry(const nnvm::NodeEntry &_e): e(_e) {} + bool operator()(const nnvm::NodeEntry e1) { + return e.node == e1.node && e.index == e1.index; + } + }; + + std::vector var_nodes; + orig_entries->clear(); orig_entries->reserve(input_entries.size()); for (size_t i = 0; i < input_entries.size(); i++) { nnvm::NodeEntry *e = input_entries[i]; @@ -72,10 +82,24 @@ bool CutGraphInputs(const std::vector &input_entries, if (e->node->is_variable() && skip_var) continue; + auto it = std::find_if(orig_entries->begin(), orig_entries->end(), + pred_entry(*e)); + bool exist = (it != orig_entries->end()); orig_entries->push_back(*e); - nnvm::Symbol sym; - sym.outputs.push_back(*e); - nnvm::NodePtr n = nnvm::CreateVariableNode(sym.ListOutputNames()[0]); + nnvm::NodePtr n; + // If we haven't seen the entry before, we need to create a new var node + // for the node entry. + if (!exist) { + nnvm::Symbol sym; + sym.outputs.push_back(*e); + n = nnvm::CreateVariableNode(sym.ListOutputNames()[0]); + } else { + // Otherwise, we use the var node created before. 
+ size_t idx = it - orig_entries->begin(); + CHECK_LT(idx, var_nodes.size()); + n = var_nodes[idx]; + } + var_nodes.push_back(n); *e = nnvm::NodeEntry{n, 0, 0}; } return true; diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 21c6b476641a..928173b5d0ff 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -6172,6 +6172,19 @@ def step6(in1, states, free): [mx.nd.random.uniform(-10, 10, states[0].shape), mx.nd.random.uniform(-10, 10, states[1].shape)]] verify_foreach(step6, [v3, v4], [v5, v6], [v7, v8], arrs, states, frees, out_grads, False) + # Test multiple inputs and outputs. + # some of the inputs are used twice. + def step8(in1, states, free): + out1 = states[0] + in1[0] + free[1] + in1[1] * 2 + free[0] + out2 = in1[0] + free[0] + states[1] * 2 + in1[1] + return ([out1, out2 * 2], [states[0] * 2, states[1] * 3]) + frees = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] + arrs = [mx.nd.random.uniform(shape=(3, 2)), mx.nd.random.uniform(shape=(3, 2))] + states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] + out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape), mx.nd.random.uniform(-10, 10, arrs[0].shape)], + [mx.nd.random.uniform(-10, 10, states[0].shape), mx.nd.random.uniform(-10, 10, states[1].shape)]] + verify_foreach(step8, [v3, v4], [v5, v6], [v7, v8], arrs, states, frees, out_grads, False) + @with_seed() def test_foreach_nested(): From 7d012d9373ea8860fa9e44253736bfe2ece8684b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 23 Jun 2018 19:00:03 +0000 Subject: [PATCH 119/135] fix lint error. --- src/nnvm/graph_editor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnvm/graph_editor.cc b/src/nnvm/graph_editor.cc index 3167dfb51405..1dee3c14ee44 100644 --- a/src/nnvm/graph_editor.cc +++ b/src/nnvm/graph_editor.cc @@ -67,7 +67,7 @@ bool CutGraphInputs(const std::vector &input_entries, bool skip_var, std::vector *orig_entries) { struct pred_entry { nnvm::NodeEntry e; - pred_entry(const nnvm::NodeEntry &_e): e(_e) {} + explicit pred_entry(const nnvm::NodeEntry &_e): e(_e) {} bool operator()(const nnvm::NodeEntry e1) { return e.node == e1.node && e.index == e1.index; } From b83253d6dfcc6c7b1ab6596f0f73d92cdff7e3c8 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sun, 24 Jun 2018 00:52:24 +0000 Subject: [PATCH 120/135] handle the case that outputs are inputs. 
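This covers step functions that simply pass an argument through. A minimal
sketch of the case (the body and shapes are illustrative only, mirroring the
new pass-through tests; the identity() wrapping is done inside foreach, not by
the user):

    import mxnet as mx

    def step(data, states):
        # The first output is literally the input symbol; foreach wraps such
        # outputs in identity() so output entries never alias input entries.
        return (data, [states[0] * 2])

    data = mx.sym.var('data')
    out, states = mx.sym.contrib.foreach(step, data, [mx.sym.var('s')])
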
--- python/mxnet/symbol/contrib.py | 31 ++++++++++++++++---------- tests/python/unittest/test_operator.py | 20 +++++++++++++++-- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 89735135acaf..f4d7e8f13846 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -220,19 +220,26 @@ def check_data(inputs, in_type, msg): "the number of output states (%d) should be the same as input states (%d)" \ % (len(sym_states), len(init_states)) - if isinstance(sym_out, list): - flat_out = sym_out - else: - flat_out = [sym_out] + sym_out = _as_list(sym_out) + flat_out = [] + all_input_names = [i.name for i in in_eles] + [s.name for s in states] + output_names = [o.name for o in sym_out] + for o in sym_out: + if o.name in all_input_names: + flat_out.append(symbol.op.identity(o)) + else: + flat_out.append(o) num_out_data = len(flat_out) - if isinstance(sym_states, list): - for s in sym_states: + + sym_states = _as_list(sym_states) + for s in sym_states: + if s.name in all_input_names or s.name in output_names: # There is a problem if the outputs are the same as the inputs # or the first output. By calling identity, we can make sure that # all symbols will refer to different NDArrays. flat_out.append(symbol.op.identity(s)) - else: - flat_out.append(symbol.op.identity(sym_states)) + else: + flat_out.append(s) g = symbol.Group(flat_out) cut_syms = _cut_subgraph(g) @@ -262,13 +269,13 @@ def check_data(inputs, in_type, msg): # this defines the location of data_syms in the list of subgraph inputs in_data_locs = [] - for name in data_names: - in_data_locs.append(subg_input_names.index(name)) + for dname in data_names: + in_data_locs.append(subg_input_names.index(dname)) # this defines the location of state_syms in the list of subgraph inputs. in_state_locs = [] - for name in state_names: - in_state_locs.append(subg_input_names.index(name)) + for sname in state_names: + in_state_locs.append(subg_input_names.index(sname)) remain_locs = [] for in_name in subg_input_names: diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 928173b5d0ff..09831fcaef58 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -6174,7 +6174,7 @@ def step6(in1, states, free): # Test multiple inputs and outputs. # some of the inputs are used twice. - def step8(in1, states, free): + def step7(in1, states, free): out1 = states[0] + in1[0] + free[1] + in1[1] * 2 + free[0] out2 = in1[0] + free[0] + states[1] * 2 + in1[1] return ([out1, out2 * 2], [states[0] * 2, states[1] * 3]) @@ -6183,7 +6183,23 @@ def step8(in1, states, free): states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape), mx.nd.random.uniform(-10, 10, arrs[0].shape)], [mx.nd.random.uniform(-10, 10, states[0].shape), mx.nd.random.uniform(-10, 10, states[1].shape)]] - verify_foreach(step8, [v3, v4], [v5, v6], [v7, v8], arrs, states, frees, out_grads, False) + verify_foreach(step7, [v3, v4], [v5, v6], [v7, v8], arrs, states, frees, out_grads, False) + + # Test the case that the output is the input. + # The output is one of the inputs. 
+ arrs = mx.nd.random.uniform(shape=(3, 2)) + states = [mx.nd.arange(2)] + frees = [mx.nd.random.uniform(shape=(2))] + out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], + [mx.nd.random.uniform(-10, 10, states[0].shape)]] + def step8(in1, states, free): + return (in1, [states[0] * free[0]]) + verify_foreach(step8, v3, [v4], [v5], arrs, states, frees, out_grads) + verify_foreach(step8, v3, [v4], [v5], arrs, states, frees, out_grads, False) + def step9(in1, states, free): + return (in1 * free[0], states) + verify_foreach(step9, v3, [v4], [v5], arrs, states, frees, out_grads) + verify_foreach(step9, v3, [v4], [v5], arrs, states, frees, out_grads, False) @with_seed() From 275bbf19f45647577a982c2caea171d076aa8fcb Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sun, 24 Jun 2018 02:41:23 +0000 Subject: [PATCH 121/135] handle the case that inputs aren't used. --- python/mxnet/symbol/contrib.py | 74 ++++++++++++++++---------- tests/python/unittest/test_operator.py | 39 +++++++++++--- 2 files changed, 77 insertions(+), 36 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index f4d7e8f13846..c871e6c81a4a 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -124,6 +124,36 @@ def _cut_subgraph(subg): syms.append(s) return syms +# This construct a subgraph for given output nodes. +# If an output node is one of the input nodes, we call identity to make sure +# that outputs nodes are different from input nodes. +def construct_subgraph(sym_out, sym_states): + sym_out = _as_list(sym_out) + sym_states = _as_list(sym_states) + all_outputs = [] + all_outputs.extend(sym_out) + all_outputs.extend(sym_states) + g = symbol.Group(all_outputs) + + flat_out = [] + all_input_names = g.list_inputs() + output_names = [o.name for o in sym_out] + for o in sym_out: + if o.name in all_input_names: + flat_out.append(symbol.op.identity(o)) + else: + flat_out.append(o) + + for s in sym_states: + if s.name in all_input_names or s.name in output_names: + # There is a problem if the outputs are the same as the inputs + # or the first output. By calling identity, we can make sure that + # all symbols will refer to different NDArrays. + flat_out.append(symbol.op.identity(s)) + else: + flat_out.append(s) + return symbol.Group(flat_out) + def foreach(body, data, init_states, name="foreach"): """Run a for loop with user-defined computation over Symbols on dimension 0. @@ -219,29 +249,12 @@ def check_data(inputs, in_type, msg): assert isinstance(init_states, list) and len(sym_states) == len(init_states), \ "the number of output states (%d) should be the same as input states (%d)" \ % (len(sym_states), len(init_states)) + num_out_data = len(sym_out) + num_states = len(sym_states) + num_outputs = num_out_data + num_states + g = construct_subgraph(sym_out, sym_states) - sym_out = _as_list(sym_out) - flat_out = [] - all_input_names = [i.name for i in in_eles] + [s.name for s in states] - output_names = [o.name for o in sym_out] - for o in sym_out: - if o.name in all_input_names: - flat_out.append(symbol.op.identity(o)) - else: - flat_out.append(o) - num_out_data = len(flat_out) - - sym_states = _as_list(sym_states) - for s in sym_states: - if s.name in all_input_names or s.name in output_names: - # There is a problem if the outputs are the same as the inputs - # or the first output. By calling identity, we can make sure that - # all symbols will refer to different NDArrays. 
- flat_out.append(symbol.op.identity(s)) - else: - flat_out.append(s) - g = symbol.Group(flat_out) - + input_syms = _get_graph_inputs(g) cut_syms = _cut_subgraph(g) input_syms = _get_graph_inputs(g) @@ -265,17 +278,24 @@ def check_data(inputs, in_type, msg): # ordered_ins contains input symbols in the following order: # data_syms, state_syms, followed by cut_vars and vars in the closure. ordered_ins = data_syms - ordered_ins.extend(init_states) - # this defines the location of data_syms in the list of subgraph inputs in_data_locs = [] for dname in data_names: - in_data_locs.append(subg_input_names.index(dname)) + # Some data may not be used. + if dname in subg_input_names: + in_data_locs.append(subg_input_names.index(dname)) + else: + raise AssertionError("the data arrays have to be used in the loop body") + ordered_ins.extend(init_states) # this defines the location of state_syms in the list of subgraph inputs. in_state_locs = [] for sname in state_names: - in_state_locs.append(subg_input_names.index(sname)) + # Some state may not be used. + if sname in subg_input_names: + in_state_locs.append(subg_input_names.index(sname)) + else: + raise AssertionError("the state arrays have to be used in the loop body") remain_locs = [] for in_name in subg_input_names: @@ -292,8 +312,6 @@ def check_data(inputs, in_type, msg): ordered_ins.append(copy.deepcopy(input_syms[in_name])) remain_locs.append(subg_input_names.index(in_name)) - num_outputs = len(flat_out) - num_states = len(state_names) ret = symbol._internal._foreach(g, *ordered_ins, num_outputs=num_outputs, num_out_data=num_out_data, in_state_locs=in_state_locs, in_data_locs=in_data_locs, remain_locs=remain_locs) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 09831fcaef58..2f19c78bdc10 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -5984,11 +5984,6 @@ def verify_foreach(step, in_syms, state_syms, free_syms, arg_grad_dict['v'+str(i)] = arr_grad i = i + 1 - gin_order = [] - for name in out.list_inputs(): - name = name[1:] - gin_order.append(int(name)) - if is_train: e = out.bind(ctx=default_context(), args=arg_dict, args_grad=arg_grad_dict) else: @@ -6048,9 +6043,10 @@ def verify_foreach(step, in_syms, state_syms, free_syms, all_ins = _as_list(in_arrs)[:] all_ins.extend(init_states) all_ins.extend(frees) - for i in range(len(all_ins)): + size = min(len(all_ins), len(e.grad_arrays)) + for i in range(size): assert_almost_equal(all_ins[i].grad.asnumpy(), - e.grad_arrays[gin_order[i]].asnumpy(), + e.grad_arrays[i].asnumpy(), rtol=0.001, atol=0.0001) # Test cases: @@ -6186,7 +6182,6 @@ def step7(in1, states, free): verify_foreach(step7, [v3, v4], [v5, v6], [v7, v8], arrs, states, frees, out_grads, False) # Test the case that the output is the input. - # The output is one of the inputs. arrs = mx.nd.random.uniform(shape=(3, 2)) states = [mx.nd.arange(2)] frees = [mx.nd.random.uniform(shape=(2))] @@ -6201,6 +6196,34 @@ def step9(in1, states, free): verify_foreach(step9, v3, [v4], [v5], arrs, states, frees, out_grads) verify_foreach(step9, v3, [v4], [v5], arrs, states, frees, out_grads, False) + # test without free variables. + def step13(in1, states, free): + return (in1, states) + verify_foreach(step13, v3, [v4], [], arrs, states, frees, out_grads) + verify_foreach(step13, v3, [v4], [], arrs, states, frees, out_grads, False) + + # Test the case that not all inputs are used. 
+ def step10(in1, states, free): + return (in1, states) + verify_foreach(step10, v3, [v4], [v5], arrs, states, frees, out_grads) + verify_foreach(step10, v3, [v4], [v5], arrs, states, frees, out_grads, False) + def step11(in1, states, free): + return (in1, free) + try: + verify_foreach(step11, v3, [v4], [v5], arrs, states, frees, out_grads) + verify_foreach(step11, v3, [v4], [v5], arrs, states, frees, out_grads, False) + except AssertionError: + print("the states have to be used") + def step12(in1, states, free): + return (in1, [states[0] + 1, states[0] + 2]) + states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))] + frees = [] + try: + verify_foreach(step12, v3, [v4, v5], [], arrs, states, frees, out_grads) + verify_foreach(step12, v3, [v4, v5], [], arrs, states, frees, out_grads, False) + except AssertionError: + print("the states have to be used") + @with_seed() def test_foreach_nested(): From 0e6df9aa996e0bcef4d0c02aa33f81d3109e011c Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 25 Jun 2018 00:32:12 +0000 Subject: [PATCH 122/135] handle the case without output data. --- python/mxnet/ndarray/contrib.py | 2 +- python/mxnet/symbol/contrib.py | 4 +++- src/operator/control_flow.cc | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index 3f8f20a73919..b1f065e9f822 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -188,6 +188,6 @@ def check_input(inputs, in_type, msg): tmp_outputs.append(ndarray.op.stack(*out)) outputs = tmp_outputs - if not_data_list: + if not_data_list and len(outputs) == 1: outputs = outputs[0] return (outputs, states) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index c871e6c81a4a..d1d5fbfd68cd 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -319,8 +319,10 @@ def check_data(inputs, in_type, msg): outs = [] for i in range(num_outputs - num_states): outs.append(ret[i]) - else: + elif num_outputs - num_states == 1: outs = ret[0] + else: + outs = [] states = [] for i in range(num_states): states.append(ret[num_outputs - num_states + i]) diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 5b8e4cb6f6e8..0b90b8e31f9b 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -63,6 +63,7 @@ DMLC_REGISTER_PARAMETER(ForeachParam); class ForeachState: public LoopState { public: ForeachParam params; + int num_iterations; ForeachState(const Symbol &g, const ForeachParam ¶ms) : LoopState(g) { this->params = params; @@ -80,6 +81,7 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, CHECK_EQ(outputs.size(), (size_t) params.num_outputs); CHECK_GT(params.in_data_locs.ndim(), 0); size_t len = inputs[0].shape()[iter_dim]; + state.num_iterations = len; for (size_t i = 1; i < params.in_data_locs.ndim(); i++) CHECK_EQ(inputs[i].shape()[iter_dim], len); for (size_t i = 0; i < (size_t) params.num_out_data; i++) @@ -182,8 +184,7 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, CHECK_EQ(arr.storage_type(), kDefaultStorage) << "The for operator doesn't support the sparse format"; size_t iter_dim = 0; - // The inputs contain out gradients, inputs and outputs. - int len = inputs[0].shape()[iter_dim]; + int len = state.num_iterations; size_t num_output_data = params.num_out_data; // In backward computation, we need to run iterations from backwards. 
From dfadc8d23f9bf87f3d72c3c91aa29b419e36a127 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 25 Jun 2018 01:56:35 +0000 Subject: [PATCH 123/135] fix a bug in foreach backward. --- src/operator/control_flow.cc | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 0b90b8e31f9b..2d81d3da7244 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -183,16 +183,15 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, for (const auto &arr : outputs) CHECK_EQ(arr.storage_type(), kDefaultStorage) << "The for operator doesn't support the sparse format"; - size_t iter_dim = 0; int len = state.num_iterations; size_t num_output_data = params.num_out_data; // In backward computation, we need to run iterations from backwards. std::vector subg_ograds(params.num_outputs); - std::vector subg_igrads = outputs; + std::vector subg_igrads(outputs.size()); for (size_t i = num_output_data; i < subg_ograds.size(); i++) subg_ograds[i] = inputs[i]; - std::vector subg_req; + std::vector subg_req(req.size()); for (auto r : req) CHECK_NE(r, kWriteInplace); @@ -206,17 +205,21 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, // [remaining vars] for (size_t i = 0; i < params.remain_locs.ndim(); i++) { size_t loc = params.remain_locs[i]; - subg_igrads[loc] = outputs[i + params.in_data_locs.ndim() + params.in_state_locs.ndim()]; + size_t orig_loc = i + params.in_data_locs.ndim() + params.in_state_locs.ndim(); + subg_igrads[loc] = outputs[orig_loc]; + subg_req[loc] = req[orig_loc]; } for (int iter_num = len - 1; iter_num >= 0; iter_num--) { for (int i = 0; i < params.num_out_data; i++) subg_ograds[i] = inputs[i].At(iter_num); - if (iter_num == len - 1) { - subg_req = req; - } else { - subg_req.clear(); - subg_req.resize(req.size(), kAddTo); + if (iter_num < len - 1) { + // For the rest of the iterations, we should add graidents to the + // remaining vars. + for (size_t i = 0; i < params.remain_locs.ndim(); i++) { + size_t loc = params.remain_locs[i]; + subg_req[loc] = kAddTo; + } } // [data vars] From 5e9cf5fc088292cbc37719803e0bbcf3e207ab3e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 25 Jun 2018 22:21:01 +0000 Subject: [PATCH 124/135] fix a bug when there isn't output data. --- src/operator/control_flow.cc | 4 ++-- tests/python/unittest/test_operator.py | 25 +++++++++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 2d81d3da7244..4d59a4328435 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -101,14 +101,14 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr, // of outputs. In this way, we don't need to copy the results from the // subgraph to the final outputs of the loop. if (len % 2 == 1) { - for (size_t i = 1; i < subg_outputs1.size(); i++) { + for (size_t i = params.num_out_data; i < subg_outputs1.size(); i++) { subg_outputs1[i] = outputs[i]; subg_outputs2[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), true, outputs[i].dtype()); } } else { // Otherwise, we'll use the second set of outputs. 
- for (size_t i = 1; i < subg_outputs1.size(); i++) { + for (size_t i = params.num_out_data; i < subg_outputs1.size(); i++) { subg_outputs1[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), true, outputs[i].dtype()); subg_outputs2[i] = outputs[i]; diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 2f19c78bdc10..50c9560376a7 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -6196,12 +6196,6 @@ def step9(in1, states, free): verify_foreach(step9, v3, [v4], [v5], arrs, states, frees, out_grads) verify_foreach(step9, v3, [v4], [v5], arrs, states, frees, out_grads, False) - # test without free variables. - def step13(in1, states, free): - return (in1, states) - verify_foreach(step13, v3, [v4], [], arrs, states, frees, out_grads) - verify_foreach(step13, v3, [v4], [], arrs, states, frees, out_grads, False) - # Test the case that not all inputs are used. def step10(in1, states, free): return (in1, states) @@ -6224,6 +6218,25 @@ def step12(in1, states, free): except AssertionError: print("the states have to be used") + # test without free variables. + def step13(in1, states, free): + return (in1, states) + states = [mx.nd.random.uniform(shape=(2))] + verify_foreach(step13, v3, [v4], [], arrs, states, [], out_grads) + verify_foreach(step13, v3, [v4], [], arrs, states, [], out_grads, False) + + # test when there isn't output data or output states. + def step14(in1, states, free): + return (in1 * free[0], []) + frees = [mx.nd.random.uniform(shape=(2))] + verify_foreach(step14, v3, [], [v4], arrs, [], frees, out_grads) + verify_foreach(step14, v3, [], [v4], arrs, [], frees, out_grads, False) + def step15(in1, states, free): + return ([], [in1 * states[0] * free[0]]) + out_grads = [[], [mx.nd.random.uniform(-10, 10, states[0].shape)]] + verify_foreach(step15, v3, [v4], [v5], arrs, states, frees, out_grads) + verify_foreach(step15, v3, [v4], [v5], arrs, states, frees, out_grads, False) + @with_seed() def test_foreach_nested(): From 696f53c8ffe5208983e1883ac5d03a936e0ab86d Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 26 Jun 2018 00:26:25 +0000 Subject: [PATCH 125/135] Fix lint error. --- python/mxnet/symbol/contrib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index d1d5fbfd68cd..a252c9bcdf58 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -127,7 +127,7 @@ def _cut_subgraph(subg): # This construct a subgraph for given output nodes. # If an output node is one of the input nodes, we call identity to make sure # that outputs nodes are different from input nodes. -def construct_subgraph(sym_out, sym_states): +def _construct_subgraph(sym_out, sym_states): sym_out = _as_list(sym_out) sym_states = _as_list(sym_states) all_outputs = [] @@ -252,7 +252,7 @@ def check_data(inputs, in_type, msg): num_out_data = len(sym_out) num_states = len(sym_states) num_outputs = num_out_data + num_states - g = construct_subgraph(sym_out, sym_states) + g = _construct_subgraph(sym_out, sym_states) input_syms = _get_graph_inputs(g) cut_syms = _cut_subgraph(g) From 094977d0c36b3a087f199d5e2dc971461488faad Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 26 Jun 2018 00:42:30 +0000 Subject: [PATCH 126/135] test diff Gluon RNN cells. 
--- tests/python/unittest/test_gluon_rnn.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index cd0ffd9b72be..302928b05f75 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -38,21 +38,21 @@ def test_rnn(): class TestRNNLayer(gluon.HybridBlock): - def __init__(self, hidden_size, prefix=None, params=None): + def __init__(self, cell_type, hidden_size, prefix=None, params=None): super(TestRNNLayer, self).__init__(prefix=prefix, params=params) - self.cell = gluon.rnn.RNNCell(hidden_size, prefix='rnn_') + self.cell = cell_type(hidden_size, prefix='rnn_') def hybrid_forward(self, F, inputs, states): - states = [states] out, states = F.contrib.foreach(self.cell, inputs, states) return out -def test_contrib_rnn(): +def check_contrib_rnn(cell_type, num_states): batch_size = 10 hidden_size = 100 rnn_data = mx.nd.normal(loc=0, scale=1, shape=(5, batch_size, 50)) - states = mx.nd.normal(loc=0, scale=1, shape=(batch_size, hidden_size)) - layer = TestRNNLayer(hidden_size) + state_shape = (batch_size, hidden_size) + states = [mx.nd.normal(loc=0, scale=1, shape=state_shape) for i in range(num_states)] + layer = TestRNNLayer(cell_type, hidden_size) layer.initialize(ctx=mx.cpu(0)) res1 = layer(rnn_data, states) params1 = layer.collect_params() @@ -64,7 +64,7 @@ def test_contrib_rnn(): res1.backward() trainer.step(batch_size) - layer = TestRNNLayer(hidden_size) + layer = TestRNNLayer(cell_type, hidden_size) layer.initialize(ctx=mx.cpu(0)) layer.hybridize() res2 = layer(rnn_data, states) @@ -85,6 +85,13 @@ def test_contrib_rnn(): assert_almost_equal(weight1.asnumpy(), weight2.asnumpy(), rtol=0.001, atol=0.0001) +def test_contrib_rnn(): + cell_types = [(gluon.rnn.RNNCell, 1), (gluon.rnn.LSTMCell, 2), + (gluon.rnn.GRUCell, 1)] + for cell_type, num_states in cell_types: + check_contrib_rnn(cell_type, num_states) + + def test_lstm(): cell = gluon.rnn.LSTMCell(100, prefix='rnn_') inputs = [mx.sym.Variable('rnn_t%d_data'%i) for i in range(3)] From 7b016ae6d9f4d3291cbe37b243e3e8ced90e65b6 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 26 Jun 2018 01:47:05 +0000 Subject: [PATCH 127/135] test all symbol RNN cells. --- tests/python/unittest/test_operator.py | 118 +++++++++---------------- 1 file changed, 44 insertions(+), 74 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 50c9560376a7..eb49e42c361c 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -19,6 +19,7 @@ from __future__ import print_function import numpy as np import mxnet as mx +import copy import math import random import itertools @@ -6292,27 +6293,19 @@ def step_nd(in1, states): assert_almost_equal(state.grad.asnumpy(), state_grad.asnumpy()) -@with_seed() -def test_foreach_lstm(): +def check_foreach_rnn(cell_type, num_states): data = mx.sym.var("data") - init_h = mx.sym.var("h") - init_c = mx.sym.var("c") - i2h_weight = mx.sym.var("i2h_weight") - h2h_weight = mx.sym.var("h2h_weight") - i2h_bias = mx.sym.var("i2h_bias") - h2h_bias = mx.sym.var("h2h_bias") + params = mx.rnn.RNNParams() + hidden_dim = 4 + input_dim = 5 + seq_len = 2 + batch_size = 2 # This tests foreach with accumulation sum. 
def step(in1, states): - params = mx.rnn.RNNParams() - params._params['i2h_weight'] = i2h_weight - params._params['h2h_weight'] = h2h_weight - params._params['i2h_bias'] = i2h_bias - params._params['h2h_bias'] = h2h_bias - lstm = mx.rnn.LSTMCell(4, prefix='mylstm_', params=params) - next_h, [next_h, next_c] = lstm(in1, states) - # TODO This is problematic. We can't count on the user to define two different symbols. - return (next_h, [next_h, next_c]) + rnn = cell_type(hidden_dim, prefix='', params=params) + next_h, states = rnn(in1, states) + return (next_h, states) def sym_group(out): if (isinstance(out[0], mx.sym.Symbol)): @@ -6322,47 +6315,27 @@ def sym_group(out): ret.extend(out[1]) return mx.sym.Group(ret) - # Inputs - data_arr = mx.nd.random.uniform(shape=(2, 2, 4)) - h_arr = mx.nd.random.uniform(shape=(2, 4)) - c_arr = mx.nd.random.uniform(shape=(2, 4)) - i2h_warr = mx.nd.random.uniform(shape=(16, 4)) - h2h_warr = mx.nd.random.uniform(shape=(16, 4)) - i2h_barr = mx.nd.random.uniform(shape=(16)) - h2h_barr = mx.nd.random.uniform(shape=(16)) - args1 = {'data': data_arr, 'h': h_arr, 'c': c_arr, - 'i2h_weight': i2h_warr, 'h2h_weight': h2h_warr, - 'i2h_bias': i2h_barr, 'h2h_bias': h2h_barr} - args2 = {'data': data_arr, 'h': h_arr, 'c': c_arr, - 'i2h_weight': i2h_warr, 'h2h_weight': h2h_warr, - 'i2h_bias': i2h_barr, 'h2h_bias': h2h_barr} + rnn = cell_type(hidden_dim, prefix='', params=params) + if num_states == 2: + init_states = [mx.sym.var("h"), mx.sym.var("c")] + else: + init_states = [mx.sym.var("h")] + out = mx.sym.contrib.foreach(step, data, init_states) + out = sym_group(out) + arg_shapes, out_shapes, aux_shapes = out.infer_shape(data=(seq_len, batch_size, input_dim), + h=(batch_size, hidden_dim)) + rnn_inputs = out.list_inputs() + # Inputs + args1 = {name:mx.nd.random.uniform(shape=arg_shapes[i]) for i, name in enumerate(rnn_inputs)} + args2 = copy.deepcopy(args1) # gradients for the backward of the foreach symbol - data_arr_grad1 = mx.nd.empty(data_arr.shape) - h_arr_grad1 = mx.nd.empty(h_arr.shape) - c_arr_grad1 = mx.nd.empty(c_arr.shape) - i2h_warr_grad1 = mx.nd.empty(i2h_warr.shape) - h2h_warr_grad1 = mx.nd.empty(h2h_warr.shape) - i2h_barr_grad1 = mx.nd.empty(i2h_barr.shape) - h2h_barr_grad1 = mx.nd.empty(h2h_barr.shape) - args_grad1 = {'data': data_arr_grad1, 'h': h_arr_grad1, 'c': c_arr_grad1, - 'i2h_weight': i2h_warr_grad1, 'h2h_weight': h2h_warr_grad1, - 'i2h_bias': i2h_barr_grad1, 'h2h_bias': h2h_barr_grad1} - + args_grad1 = {name:mx.nd.empty(shape=arg_shapes[i]) for i, name in enumerate(rnn_inputs)} # gradients for the backward of the unrolled symbol. - data_arr_grad2 = mx.nd.empty(data_arr.shape) - h_arr_grad2 = mx.nd.empty(h_arr.shape) - c_arr_grad2 = mx.nd.empty(c_arr.shape) - i2h_warr_grad2 = mx.nd.empty(i2h_warr.shape) - h2h_warr_grad2 = mx.nd.empty(h2h_warr.shape) - i2h_barr_grad2 = mx.nd.empty(i2h_barr.shape) - h2h_barr_grad2 = mx.nd.empty(h2h_barr.shape) - args_grad2 = {'data': data_arr_grad2, 'h': h_arr_grad2, 'c': c_arr_grad2, - 'i2h_weight': i2h_warr_grad2, 'h2h_weight': h2h_warr_grad2, - 'i2h_bias': i2h_barr_grad2, 'h2h_bias': h2h_barr_grad2} + args_grad2 = {name:mx.nd.empty(shape=arg_shapes[i]) for i, name in enumerate(rnn_inputs)} # Symbol of running LSTM with foreach. 
- out = mx.sym.contrib.foreach(step, data, [init_h, init_c]) + out = mx.sym.contrib.foreach(step, data, init_states) out = sym_group(out) js_1 = out.tojson() out = mx.sym.load_json(js_1) @@ -6371,15 +6344,15 @@ def sym_group(out): e1 = out.bind(ctx=default_context(), args=args1, args_grad=args_grad1) # Symbol of running unrolled LSTM. - lstm = mx.rnn.LSTMCell(4, prefix='') - h = init_h - c = init_c + lstm = cell_type(hidden_dim, prefix='') unroll_outs = [] - for inputs in mx.sym.split(data, num_outputs=data_arr.shape[0], axis=0, squeeze_axis=True): - h, [h, c] = lstm(inputs, [h, c]) + states = init_states + for inputs in mx.sym.split(data, num_outputs=seq_len, axis=0, squeeze_axis=True): + h, states = lstm(inputs, states) unroll_outs.append(mx.sym.expand_dims(h, axis=0)) - unroll_outs = mx.sym.concat(*unroll_outs, dim=0) - out = mx.sym.Group([unroll_outs, h, c]) + unroll_outs = _as_list(mx.sym.concat(*unroll_outs, dim=0)) + unroll_outs.extend(states) + out = mx.sym.Group(unroll_outs) js_1 = out.tojson() out = mx.sym.load_json(js_1) js_2 = out.tojson() @@ -6391,23 +6364,13 @@ def sym_group(out): for arr in e1.outputs: out_grads.append(mx.nd.random.uniform(-10, 10, arr.shape)) - data_arr = mx.nd.random.uniform(shape=(2, 2, 4)) - h_arr = mx.nd.random.uniform(shape=(2, 4)) - c_arr = mx.nd.random.uniform(shape=(2, 4)) - i2h_warr = mx.nd.random.uniform(shape=(16, 4)) - h2h_warr = mx.nd.random.uniform(shape=(16, 4)) - i2h_barr = mx.nd.random.uniform(shape=(16)) - h2h_barr = mx.nd.random.uniform(shape=(16)) - - e1.forward(is_train=True, data = data_arr, h = h_arr, c = c_arr, - i2h_weight = i2h_warr, h2h_weight = h2h_warr, - i2h_bias = i2h_barr, h2h_bias = h2h_barr) + args = {name:mx.nd.random.uniform(shape=arg_shapes[i]) for i, name in enumerate(rnn_inputs)} + + e1.forward(is_train=True, **args) outputs1 = e1.outputs e1.backward(out_grads) - e2.forward(is_train=True, data = data_arr, h = h_arr, c = c_arr, - i2h_weight = i2h_warr, h2h_weight = h2h_warr, - i2h_bias = i2h_barr, h2h_bias = h2h_barr) + e2.forward(is_train=True, **args) outputs2 = e2.outputs e2.backward(out_grads) @@ -6420,6 +6383,13 @@ def sym_group(out): assert_almost_equal(args_grad1[name].asnumpy(), args_grad2[name].asnumpy()) +@with_seed() +def test_foreach_rnn(): + cell_types = [(mx.rnn.LSTMCell, 2), (mx.rnn.RNNCell, 1), (mx.rnn.GRUCell, 1)] + for cell_type, num_states in cell_types: + check_foreach_rnn(cell_type, num_states) + + @with_seed() def test_squeeze_op(): def check_squeeze_op(shape, axis=None): From 9609ce87280cc47887e0f464ffe00dbc7153787d Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 26 Jun 2018 00:38:28 -0700 Subject: [PATCH 128/135] adjust the test precision. --- tests/python/unittest/test_operator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index eb49e42c361c..7d9a36d83d7f 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -6380,7 +6380,8 @@ def sym_group(out): input_names = out.list_inputs() for i in range(len(e1.grad_arrays)): name = input_names[i] - assert_almost_equal(args_grad1[name].asnumpy(), args_grad2[name].asnumpy()) + assert_almost_equal(args_grad1[name].asnumpy(), args_grad2[name].asnumpy(), + rtol=0.001, atol=0.0001) @with_seed() From fa8abbdb0e8f4af606844894a22cd45d71ebc0f7 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 26 Jun 2018 18:27:10 +0000 Subject: [PATCH 129/135] Fix a bug in getting a list of variable names. 
We can't get a list of variable names from a hashtable. The order can't be guaranteed. Python2 and Python3 output different orders. --- python/mxnet/symbol/contrib.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index a252c9bcdf58..b3104c4a17ec 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -266,11 +266,12 @@ def check_data(inputs, in_type, msg): gin_names = input_syms.keys() # This array contains the symbols for the inputs of foreach. # They are ordered according to the inputs of the subgraph. + init_states = _as_list(init_states) states_map = {sym.name:sym for sym in init_states} - state_names = states_map.keys() + state_names = [sym.name for sym in init_states] data_syms = _as_list(data) data_map = {sym.name:sym for sym in data_syms} - data_names = data_map.keys() + data_names = [sym.name for sym in data_syms] cut_var_map = {sym.list_outputs()[0]:sym for sym in cut_syms} cut_var_names = cut_var_map.keys() From 8e74d801903b60425ca0d6dc5d0942c84cb6ce3c Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 26 Jun 2018 18:33:09 +0000 Subject: [PATCH 130/135] fix lint error. --- python/mxnet/symbol/contrib.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index b3104c4a17ec..4c03753e183d 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -267,10 +267,8 @@ def check_data(inputs, in_type, msg): # This array contains the symbols for the inputs of foreach. # They are ordered according to the inputs of the subgraph. init_states = _as_list(init_states) - states_map = {sym.name:sym for sym in init_states} state_names = [sym.name for sym in init_states] data_syms = _as_list(data) - data_map = {sym.name:sym for sym in data_syms} data_names = [sym.name for sym in data_syms] cut_var_map = {sym.list_outputs()[0]:sym for sym in cut_syms} cut_var_names = cut_var_map.keys() From 24391050a0c9fc523db039f9f5b3bf5200c25f61 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 27 Jun 2018 19:21:22 +0000 Subject: [PATCH 131/135] Test 1D array. 
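Iterating over a 1-D array is a corner case for shape inference: slicing off
the iteration axis would leave no dimensions, so the shape pass below treats
each per-step slice as shape (1,) and maps it back to shape (len,) when
inferring the operator inputs. A small sketch of the case the new tests cover
(names and shapes are illustrative):

    import mxnet as mx

    def step(x, states):
        # x is one element of the 1-D input, seen by the body as shape (1,)
        return (x * states[0], [states[0] * 2])

    data = mx.sym.var('data')   # bound to an NDArray of shape (3,)
    out, states = mx.sym.contrib.foreach(step, data, [mx.sym.var('s')])
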
--- src/operator/control_flow.cc | 33 ++++++++++++++++++++------ tests/python/unittest/test_operator.py | 18 ++++++++++++++ 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 4d59a4328435..767115b37703 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -266,6 +266,16 @@ static void remap(const std::vector &op_in, size_t start, } } +static inline TShape SliceFirstDim(const TShape &s) { + if (s.ndim() > 1) { + return TShape(s.begin() + 1, s.end()); + } else { + TShape s(1); + s[0] = 1; + return s; + } +} + static bool ForeachShape(const nnvm::NodeAttrs& attrs, std::vector *in_shape, std::vector *out_shape) { @@ -275,9 +285,12 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, std::vector subg_in_shape(in_shape->size()); // data shape + std::vector data_1d(params.in_data_locs.ndim(), false); for (size_t i = 0; i < params.in_data_locs.ndim(); i++) { size_t loc = params.in_data_locs[i]; - subg_in_shape[loc] = TShape(in_shape->at(i).begin() + 1, in_shape->at(i).end()); + if (in_shape->at(i).ndim() == 1) + data_1d[i] = true; + subg_in_shape[loc] = SliceFirstDim(in_shape->at(i)); } // state shape remap(*in_shape, params.in_data_locs.ndim(), params.in_state_locs, @@ -292,7 +305,7 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, // If we don't have shape info, we don't need to do anything. if (shape.ndim() == 0) continue; - subg_out_shape[i] = TShape(shape.begin() + 1, shape.end()); + subg_out_shape[i] = SliceFirstDim(shape); } bool infer_success = InferSubgraphShape(*attrs.subgraphs[0], @@ -329,11 +342,17 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs, if (shape.ndim() == 0) continue; - auto in = TShape(shape.ndim() + 1); - in[0] = len; - for (size_t i = 1; i < in.ndim(); i++) - in[i] = shape[i - 1]; - SHAPE_ASSIGN_CHECK(*in_shape, i, in); + if (data_1d[i]) { + TShape s(1); + s[0] = len; + SHAPE_ASSIGN_CHECK(*in_shape, i, s); + } else { + auto in = TShape(shape.ndim() + 1); + in[0] = len; + for (size_t i = 1; i < in.ndim(); i++) + in[i] = shape[i - 1]; + SHAPE_ASSIGN_CHECK(*in_shape, i, in); + } } // For the shape of state. for (size_t i = 0; i < params.in_state_locs.ndim(); i++) { diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 7d9a36d83d7f..cd5e652fc99c 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -6238,6 +6238,24 @@ def step15(in1, states, free): verify_foreach(step15, v3, [v4], [v5], arrs, states, frees, out_grads) verify_foreach(step15, v3, [v4], [v5], arrs, states, frees, out_grads, False) + # Test the case of iterating on a 1D data array. 
+ def step16(in1, states, free): + return ([in1[0] * states[0]], [states[0] * 2]) + arrs = [mx.nd.arange(3)] + states = [mx.nd.random.uniform(shape=(1))] + out_grads = [[mx.nd.random.uniform(-10, 10, (3, 1))], + [mx.nd.random.uniform(-10, 10, (1))]] + verify_foreach(step16, [v3], [v4], [], arrs, states, [], out_grads) + verify_foreach(step16, [v3], [v4], [], arrs, states, [], out_grads, False) + def step17(in1, states, free): + return ([in1[1] * in1[0] * states[0]], [states[0] * 2]) + arrs = [mx.nd.random.uniform(shape=(3, 1)), mx.nd.arange(3)] + states = [mx.nd.random.uniform(shape=(1))] + out_grads = [[mx.nd.random.uniform(-10, 10, (3, 1))], + [mx.nd.random.uniform(-10, 10, (1))]] + verify_foreach(step17, [v3, v4], [v5], [], arrs, states, [], out_grads) + verify_foreach(step17, [v3, v4], [v5], [], arrs, states, [], out_grads, False) + @with_seed() def test_foreach_nested(): From 53cdbfafcf98b801467b33b1a573f9762f34237e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 27 Jun 2018 22:07:11 +0000 Subject: [PATCH 132/135] fix a bug when subgraph inputs and outputs share NDArray. --- src/operator/subgraph_op_common.cc | 30 +++++++++++++++++++------- src/operator/subgraph_op_common.h | 8 +++---- tests/python/unittest/test_operator.py | 2 +- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index 55da55cd28d7..71a9a21c28c4 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -175,9 +175,9 @@ LoopState::LoopState(const Symbol &g) { } void LoopState::Forward(int iter_no, - std::vector cinputs, + const std::vector &cinputs, const std::vector& req, - std::vector coutputs, + const std::vector &coutputs, bool is_recording) { using namespace nnvm; using namespace imperative; @@ -188,14 +188,21 @@ void LoopState::Forward(int iter_no, else orig_is_record = Imperative::Get()->is_recording(); + std::vector in_bufs = cinputs; + std::vector out_bufs = coutputs; std::vector inputs(cinputs.size()); std::vector outputs(coutputs.size()); for (size_t i = 0; i < inputs.size(); i++) - inputs[i] = &cinputs[i]; + inputs[i] = &in_bufs[i]; for (size_t i = 0; i < outputs.size(); i++) - outputs[i] = &coutputs[i]; + outputs[i] = &out_bufs[i]; OpStatePtr state = iter_op->Forward(nullptr, inputs, outputs); + // If an input and an output share the array, the output array will be changed + // by CachedOp. We need to copy data to the real output. 
+ for (size_t i = 0; i < out_bufs.size(); i++) + if (!out_bufs[i].IsSame(coutputs[i])) + CopyFromTo(out_bufs[i], coutputs[i]); if (is_recording) { all_inputs.push_back(cinputs); all_outputs.push_back(coutputs); @@ -206,9 +213,9 @@ void LoopState::Forward(int iter_no, } void LoopState::Backward(int iter_no, - std::vector ograds, + const std::vector &ograds, const std::vector &req, - std::vector igrads) { + const std::vector &igrads) { using namespace nnvm; using namespace imperative; @@ -219,8 +226,10 @@ void LoopState::Backward(int iter_no, std::vector outputs; inputs.reserve(op->num_backward_inputs()); outputs.reserve(op->num_inputs()); + std::vector ograd_bufs = ograds; + std::vector igrad_bufs = igrads; for (size_t i = 0; i < ograds.size(); i++) - inputs.push_back(&ograds[i]); + inputs.push_back(&ograd_bufs[i]); const std::vector &save_inputs = op->save_inputs(); const std::vector &save_outputs = op->save_outputs(); @@ -236,10 +245,15 @@ void LoopState::Backward(int iter_no, } CHECK_EQ(inputs.size(), op->num_backward_inputs()); for (size_t i = 0; i < igrads.size(); i++) - outputs.push_back(&igrads[i]); + outputs.push_back(&igrad_bufs[i]); CHECK_EQ(outputs.size(), op->num_inputs()); auto state = all_states[iter_no]; op->Backward(false, state, inputs, req, outputs); + // If an input and an output share the array, the output array will be changed + // by CachedOp. We need to copy data to the real output. + for (size_t i = 0; i < igrads.size(); i++) + if (!igrads[i].IsSame(igrad_bufs[i])) + CopyFromTo(igrad_bufs[i], igrads[i]); } } // namespace op diff --git a/src/operator/subgraph_op_common.h b/src/operator/subgraph_op_common.h index a2706bebc0d3..79078409e214 100644 --- a/src/operator/subgraph_op_common.h +++ b/src/operator/subgraph_op_common.h @@ -78,14 +78,14 @@ class LoopState { explicit LoopState(const Symbol &g); void Forward(int iter_no, - std::vector cinputs, + const std::vector &inputs, const std::vector& req, - std::vector coutputs, + const std::vector &outputs, bool is_recording); void Backward(int iter_no, - std::vector ograds, + const std::vector &ograds, const std::vector &req, - std::vector igrads); + const std::vector &igrads); void Cleanup() { all_outputs.clear(); all_inputs.clear(); diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index cd5e652fc99c..ae5cba21711a 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -6228,7 +6228,7 @@ def step13(in1, states, free): # test when there isn't output data or output states. def step14(in1, states, free): - return (in1 * free[0], []) + return (in1 + free[0], []) frees = [mx.nd.random.uniform(shape=(2))] verify_foreach(step14, v3, [], [v4], arrs, [], frees, out_grads) verify_foreach(step14, v3, [], [v4], arrs, [], frees, out_grads, False) From 9bff317af52be75dd46506bcf2e6fa1bda1e3abb Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 28 Jun 2018 06:47:08 +0000 Subject: [PATCH 133/135] fix. 
--- src/operator/control_flow.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 767115b37703..f18c8b828a2f 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -270,9 +270,9 @@ static inline TShape SliceFirstDim(const TShape &s) { if (s.ndim() > 1) { return TShape(s.begin() + 1, s.end()); } else { - TShape s(1); - s[0] = 1; - return s; + TShape ret(1); + ret[0] = 1; + return ret; } } From d3687ef0cdc90a559f7fc135bd81ff39ccc981ac Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 28 Jun 2018 06:56:15 +0000 Subject: [PATCH 134/135] fix --- src/operator/control_flow.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index f18c8b828a2f..c091fdb67e0f 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -270,9 +270,7 @@ static inline TShape SliceFirstDim(const TShape &s) { if (s.ndim() > 1) { return TShape(s.begin() + 1, s.end()); } else { - TShape ret(1); - ret[0] = 1; - return ret; + return TShape(mshadow::Shape1(1)); } } From 392a7e4c1f03ab3ba91196191fe18b21b5891f36 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 2 Jul 2018 18:37:42 +0000 Subject: [PATCH 135/135] add comments. --- python/mxnet/symbol/contrib.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 4c03753e183d..28bb507dd13d 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -173,6 +173,10 @@ def foreach(body, data, init_states, name="foreach"): as the first output of foreach; states from the last execution of body are the second output of foreach. + foreach can output only output data or states. If a user only wants states, + the body function can return ([], states). Similarly, if a user only wants + output data, the body function can return (out, []). + The computation done by this operator is equivalent to the pseudo code below when the input data is NDArray:
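
For the single data input, single output case, that loop can be sketched
roughly as follows (an illustrative rendering of the semantics, not the
operator's implementation):

    states = init_states
    outs = []
    for i in range(data.shape[0]):
        out, states = body(data[i], states)   # one slice along dim 0 per step
        outs.append(out)
    outs = mx.nd.stack(*outs)                 # per-step outputs stacked on dim 0
    # foreach returns (outs, states); either part may be empty, as noted above.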