diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index f6122c1f37b6..c954cabadac0 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -483,6 +483,13 @@ MXNET_DLL int MXSymbolInferShape(SymbolHandle sym,
 //--------------------------------------------
 // Part 4: Executor interface
 //--------------------------------------------
+/*!
+ * \brief Print the content of the execution plan, used for debugging.
+ * \param handle the executor.
+ * \param out_str pointer to hold the output string of the printing.
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXExecutorPrint(ExecutorHandle handle, const char **out_str);
 /*!
  * \brief Executor forward method
  *
diff --git a/include/mxnet/resource.h b/include/mxnet/resource.h
index a6c61f6f8862..d85299960511 100644
--- a/include/mxnet/resource.h
+++ b/include/mxnet/resource.h
@@ -20,21 +20,19 @@ struct ResourceRequest {
   enum Type {
     /*! \brief mshadow::Random<xpu> object */
     kRandom,
-    /*! \brief Temporal space */
+    /*! \brief A dynamic temp space that can be arbitrary size */
    kTempSpace
   };
   /*! \brief type of resources */
   Type type;
-  /*! \brief size of space requested, in terms of number of reals */
-  size_t space_num_reals;
   /*! \brief default constructor */
   ResourceRequest() {}
   /*!
    * \brief constructor, allow implicit conversion
    * \param type type of resources
    */
-  ResourceRequest(Type type, size_t space_num_reals = 0)  // NOLINT(*)
-      : type(type), space_num_reals(space_num_reals) {}
+  ResourceRequest(Type type)  // NOLINT(*)
+      : type(type) {}
 };
@@ -48,11 +46,15 @@ struct Resource {
   ResourceRequest req;
   /*! \brief engine variable */
   engine::VarHandle var;
+  /*! \brief identifier of the resource, used for debug purposes */
+  int32_t id;
   /*!
    * \brief pointer to the resource, do not use directly,
    *  access using member functions
    */
   void *ptr_;
+  /*! \brief default constructor */
+  Resource() : id(0) {}
   /*!
    * \brief Get random number generator.
    * \param The stream to use in the random number generator.
@@ -70,7 +72,8 @@ struct Resource {
   }
   /*!
    * \brief Get space requested as mshadow Tensor.
-   *  The resulting tensor must fit in space requsted.
+   *  The caller can request arbitrary size.
+   *
    * \param shape the Shape of the returning tensor.
    * \param stream the stream of the returning tensor.
    * \return the mshadow tensor requested.
    */
@@ -81,9 +84,11 @@ inline mshadow::Tensor<xpu, ndim, real_t> get_space(
       mshadow::Shape<ndim> shape, mshadow::Stream<xpu> *stream) const {
     CHECK_EQ(req.type, ResourceRequest::kTempSpace);
-    CHECK_GE(req.space_num_reals, shape.Size());
+    mshadow::TensorContainer<xpu, 1, real_t> *space =
+        static_cast<mshadow::TensorContainer<xpu, 1, real_t>*>(ptr_);
+    space->Resize(mshadow::Shape1(shape.Size()));
     return mshadow::Tensor<xpu, ndim, real_t>(
-        static_cast<real_t*>(ptr_), shape, shape[ndim - 1], stream);
+        space->dptr_, shape, shape[ndim - 1], stream);
   }
 };
@@ -97,7 +102,6 @@ class ResourceManager {
    * \return the requested resource.
    * \note The returned resource's ownership is
    *       still held by the manager singleton.
-   *
    */
   virtual Resource Request(Context ctx, const ResourceRequest &req) = 0;
   /*!
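Note on usage: with the reworked get_space above, an operator no longer sizes its workspace when the resource is declared; the size is chosen at each call and the TensorContainer behind ptr_ is resized to fit. A minimal sketch of the operator-side pattern (the helper name, the 1-D shape, and the zero-fill are illustrative, not part of this patch):

    #include <mxnet/base.h>
    #include <mxnet/resource.h>

    // Hypothetical helper: obtain an arbitrary-sized scratch tensor from a
    // kTempSpace resource handed to an operator via OpContext::requested.
    template<typename xpu>
    void UseScratch(const mxnet::Resource &res, mshadow::index_t n,
                    mshadow::Stream<xpu> *stream) {
      CHECK_EQ(res.req.type, mxnet::ResourceRequest::kTempSpace);
      // The size is chosen per call; get_space resizes the TensorContainer
      // behind ptr_, so no size appears in the ResourceRequest anymore.
      mshadow::Tensor<xpu, 1, mxnet::real_t> scratch =
          res.get_space(mshadow::Shape1(n), stream);
      scratch = 0.0f;  // e.g. zero the workspace before use
    }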
diff --git a/include/mxnet/symbolic.h b/include/mxnet/symbolic.h
index 9f1f21d69d59..ad01040007a7 100644
--- a/include/mxnet/symbolic.h
+++ b/include/mxnet/symbolic.h
@@ -400,6 +400,11 @@ class Executor {
    * \param head_grads the gradient of head nodes to be backproped.
    */
   virtual void Backward(const std::vector<NDArray> &head_grads) = 0;
+  /*!
+   * \brief print the execution plan info to the output stream.
+   * \param os the output stream to print to.
+   */
+  virtual void Print(std::ostream &os) const {}  // NOLINT(*)
   /*!
    * \brief get array of outputs in the executor.
    * \return array of outputs in the executor.
diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py
index f57077adc919..27511f5ff38e 100644
--- a/python/mxnet/executor.py
+++ b/python/mxnet/executor.py
@@ -5,8 +5,8 @@
 import ctypes
 from .base import _LIB
-from .base import c_array, mx_uint, NDArrayHandle, ExecutorHandle
-from .base import check_call
+from .base import mx_uint, NDArrayHandle, ExecutorHandle
+from .base import check_call, c_array, py_str
 from .ndarray import NDArray
 
 class Executor(object):
@@ -81,6 +81,19 @@ def backward(self, head_grads=None):
         ndarray = c_array(NDArrayHandle, [item.handle for item in head_grads])
         check_call(_LIB.MXExecutorBackward(self.handle, len(head_grads), ndarray))
 
+    def debug_str(self):
+        """Get a debug string describing the internal execution plan.
+
+        Returns
+        -------
+        debug_str : string
+            Debug string of the executor.
+        """
+        debug_str = ctypes.c_char_p()
+        check_call(_LIB.MXExecutorPrint(
+            self.handle, ctypes.byref(debug_str)))
+        return py_str(debug_str.value)
+
     @property
     def outputs(self):
         """list all heads' output ndarray
diff --git a/src/c_api.cc b/src/c_api.cc
index 222440f76855..c453f743aba3 100644
--- a/src/c_api.cc
+++ b/src/c_api.cc
@@ -684,6 +684,17 @@ int MXSymbolInferShape(SymbolHandle sym,
   API_END();
 }
 
+int MXExecutorPrint(ExecutorHandle handle, const char **out_str) {
+  Executor *exec = static_cast<Executor*>(handle);
+  MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
+  API_BEGIN();
+  std::ostringstream os;
+  exec->Print(os);
+  ret->ret_str = os.str();
+  *out_str = (ret->ret_str).c_str();
+  API_END();
+}
+
 int MXExecutorForward(ExecutorHandle handle, bool is_train) {
   API_BEGIN();
   Executor *exec = static_cast<Executor*>(handle);
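For reference, the new entry point can also be driven from C/C++ directly, without going through the Python debug_str wrapper. A sketch only, with error handling elided; exec is assumed to come from MXExecutorBind:

    #include <stdio.h>
    #include <mxnet/c_api.h>

    void DumpPlan(ExecutorHandle exec) {
      const char *plan = NULL;
      // The string is owned by MXNet's thread-local store (ret_str above),
      // so copy it if it must outlive the next C-API call on this thread.
      if (MXExecutorPrint(exec, &plan) == 0) {
        printf("%s\n", plan);
      }
    }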
diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h
index adc84cdf7e9e..b9303e86f08c 100644
--- a/src/engine/stream_manager.h
+++ b/src/engine/stream_manager.h
@@ -75,8 +75,9 @@ template <std::size_t kNumGpus, std::size_t kStreams>
 RunContext StreamManager<kNumGpus, kStreams>::GetIORunContext(
     Context const& ctx) {
   RunContext ret;
+  ret.stream = nullptr;
   switch (ctx.dev_mask) {
-    case cpu::kDevMask: ret.stream = nullptr; break;
+    case cpu::kDevMask: break;
     case gpu::kDevMask: {
 #if MXNET_USE_CUDA
       CUDA_CALL(cudaSetDevice(ctx.dev_id));
diff --git a/src/operator/batch_norm-inl.h b/src/operator/batch_norm-inl.h
index 9ab15d4abe7a..c6bca12bd0db 100644
--- a/src/operator/batch_norm-inl.h
+++ b/src/operator/batch_norm-inl.h
@@ -238,9 +238,7 @@ class BatchNormProp : public OperatorProperty {
 
   std::vector<ResourceRequest> BackwardResource(
       const std::vector<TShape> &in_shape) const override {
-    const TShape &dshape = in_shape[0];
-    size_t nspace = dshape[1] * 3;
-    return {{ResourceRequest::kTempSpace, nspace}};
+    return {ResourceRequest::kTempSpace};
   }
 
   int NumVisibleOutputs() const override {
diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h
index 4ce017270a68..f26265f3f3f1 100644
--- a/src/operator/convolution-inl.h
+++ b/src/operator/convolution-inl.h
@@ -348,12 +348,12 @@ class ConvolutionProp : public OperatorProperty {
 
   virtual std::vector<ResourceRequest> ForwardResource(
       const std::vector<TShape> &in_shape) const {
-    return {{ResourceRequest::kTempSpace, param_.workspace}};
+    return {ResourceRequest::kTempSpace};
   }
 
   virtual std::vector<ResourceRequest> BackwardResource(
       const std::vector<TShape> &in_shape) const {
-    return {{ResourceRequest::kTempSpace, param_.workspace}};
+    return {ResourceRequest::kTempSpace};
   }
 
   Operator* CreateOperator(Context ctx) const;
diff --git a/src/resource.cc b/src/resource.cc
index 62d0b0080fc3..e3d1771f6d32 100644
--- a/src/resource.cc
+++ b/src/resource.cc
@@ -4,9 +4,12 @@
  * \brief Implementation of resource manager.
  */
 #include <dmlc/logging.h>
+#include <dmlc/parameter.h>
 #include <mxnet/base.h>
 #include <mxnet/engine.h>
 #include <mxnet/resource.h>
+#include <limits>
+#include <atomic>
 #include "./common/lazy_alloc_array.h"
@@ -15,10 +18,15 @@ namespace resource {
 
 // implements resource manager
 class ResourceManagerImpl : public ResourceManager {
  public:
-  ResourceManagerImpl() : global_seed_(0) {
+  ResourceManagerImpl() noexcept(false)
+      : global_seed_(0) {
+    cpu_temp_space_copy_ = dmlc::GetEnv("MXNET_CPU_TEMP_COPY", 16);
+    gpu_temp_space_copy_ = dmlc::GetEnv("MXNET_GPU_TEMP_COPY", 4);
     engine_ref_ = Engine::_GetSharedRef();
     cpu_rand_ = new ResourceRandom<cpu>(
         Context(cpu::kDevMask, 0), global_seed_);
+    cpu_space_ = new ResourceTempSpace<cpu>(
+        Context(cpu::kDevMask, 0), cpu_temp_space_copy_);
   }
   ~ResourceManagerImpl() {
     // need explicit delete, before engine get killed
@@ -32,21 +40,31 @@ class ResourceManagerImpl : public ResourceManager {
 
   // request resources
   Resource Request(Context ctx, const ResourceRequest &req) override {
-    if (req.type == ResourceRequest::kRandom) {
-      if (ctx.dev_mask == cpu::kDevMask) {
-        return cpu_rand_->resource;
-      } else {
-        CHECK_EQ(ctx.dev_mask, gpu::kDevMask);
+    if (ctx.dev_mask == cpu::kDevMask) {
+      switch (req.type) {
+        case ResourceRequest::kRandom: return cpu_rand_->resource;
+        case ResourceRequest::kTempSpace: return cpu_space_->GetNext();
+        default: LOG(FATAL) << "Unsupported resource type " << req.type;
+      }
+    } else {
+      CHECK_EQ(ctx.dev_mask, gpu::kDevMask);
 #if MSHADOW_USE_CUDA
-        return gpu_rand_.Get(ctx.dev_id, [ctx, this]() {
-          return new ResourceRandom<gpu>(ctx, global_seed_);
-        })->resource;
+      switch (req.type) {
+        case ResourceRequest::kRandom: {
+          return gpu_rand_.Get(ctx.dev_id, [ctx, this]() {
+            return new ResourceRandom<gpu>(ctx, global_seed_);
+          })->resource;
+        }
+        case ResourceRequest::kTempSpace: {
+          return gpu_space_.Get(ctx.dev_id, [ctx, this]() {
+            return new ResourceTempSpace<gpu>(ctx, gpu_temp_space_copy_);
+          })->GetNext();
+        }
+        default: LOG(FATAL) << "Unsupported resource type " << req.type;
+      }
 #else
-        LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+      LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
 #endif
-      }
-    } else {
-      LOG(FATAL) << "Unknown supported type " << req.type;
     }
     Resource ret;
     return ret;
   }
@@ -67,16 +85,13 @@ class ResourceManagerImpl : public ResourceManager {
   static constexpr std::size_t kMaxNumGPUs = 16;
   /*! \brief Random number magic number to seed different random numbers */
   static constexpr uint32_t kRandMagic = 127UL;
-  /*! \brief Reference to the engine */
-  std::shared_ptr<Engine> engine_ref_;
-
   // the random number resources
   template<typename xpu>
   struct ResourceRandom {
-    /*! \brief pointer to PRNG */
-    mshadow::Random<xpu> *prnd;
     /*! \brief the context of the PRNG */
     Context ctx;
+    /*! \brief pointer to PRNG */
+    mshadow::Random<xpu> *prnd;
     /*! \brief resource representation */
     Resource resource;
     /*! \brief constructor */
@@ -103,13 +118,65 @@ class ResourceManagerImpl : public ResourceManager {
       }, ctx, {}, {resource.var});
     }
   };
+  // temporary space resource.
+  template<typename xpu>
+  struct ResourceTempSpace {
+    /*! \brief the context of the device */
+    Context ctx;
+    /*! \brief the underlying space */
+    std::vector<mshadow::TensorContainer<xpu, 1, real_t>*> space;
+    /*! \brief resource representation */
+    std::vector<Resource> resource;
+    /*! \brief current pointer for the round-robin allocator */
+    std::atomic<size_t> curr_ptr;
+    /*! \brief constructor */
+    explicit ResourceTempSpace(Context ctx, size_t ncopy)
+        : ctx(ctx), space(ncopy), resource(ncopy), curr_ptr(0) {
+      mshadow::SetDevice<xpu>(ctx.dev_id);
+      for (size_t i = 0; i < space.size(); ++i) {
+        space[i] = new mshadow::TensorContainer<xpu, 1, real_t>();
+        resource[i].var = Engine::Get()->NewVariable();
+        resource[i].id = static_cast<int32_t>(i);
+        resource[i].ptr_ = space[i];
+        resource[i].req = ResourceRequest(ResourceRequest::kTempSpace);
+      }
+    }
+    ~ResourceTempSpace() {
+      for (size_t i = 0; i < space.size(); ++i) {
+        mshadow::TensorContainer<xpu, 1, real_t>* r = space[i];
+        Engine::Get()->DeleteVariable(
+            [r](RunContext rctx){ delete r; }, ctx, resource[i].var);
+      }
+    }
+    // get the next resource in round-robin manner
+    inline Resource GetNext() {
+      const size_t kMaxDigit = std::numeric_limits<size_t>::max() / 2;
+      size_t ptr = ++curr_ptr;
+      // reset ptr to avoid undefined behavior during overflow
+      // usually this won't happen
+      if (ptr > kMaxDigit) {
+        curr_ptr.store((ptr + 1) % space.size());
+      }
+      return resource[ptr % space.size()];
+    }
+  };
+  /*! \brief number of copies in CPU temp space */
+  int cpu_temp_space_copy_;
+  /*! \brief number of copies in GPU temp space */
+  int gpu_temp_space_copy_;
+  /*! \brief Reference to the engine */
+  std::shared_ptr<Engine> engine_ref_;
   /*! \brief internal seed to the random number generator */
   uint32_t global_seed_;
   /*! \brief CPU random number resources */
   ResourceRandom<cpu> *cpu_rand_;
+  /*! \brief CPU temp space resources */
+  ResourceTempSpace<cpu> *cpu_space_;
 #if MXNET_USE_CUDA
   /*! \brief random number generator for GPU */
   common::LazyAllocArray<ResourceRandom<gpu> > gpu_rand_;
+  /*! \brief temp space for GPU */
+  common::LazyAllocArray<ResourceTempSpace<gpu> > gpu_space_;
 #endif
 };
 }  // namespace resource
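The round-robin pool above exists so that several in-flight operators can hold temp space at once: each copy owns its own engine variable, so requests served by different copies carry no false dependency. The number of copies comes from MXNET_CPU_TEMP_COPY (default 16) and MXNET_GPU_TEMP_COPY (default 4). A sketch of the observable behavior, assuming at least two CPU copies and no concurrent requests:

    #include <mxnet/base.h>
    #include <mxnet/resource.h>

    void Demo() {
      using namespace mxnet;
      Context ctx(cpu::kDevMask, 0);
      ResourceRequest req(ResourceRequest::kTempSpace);
      // Consecutive requests walk the round-robin copies, so the two
      // resources below are distinct: different id, different engine var.
      Resource a = ResourceManager::Get()->Request(ctx, req);
      Resource b = ResourceManager::Get()->Request(ctx, req);
      CHECK_NE(a.id, b.id);
    }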
diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index a30a4c0d897c..82071da15425 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -411,21 +411,6 @@ void GraphExecutor::InitDataEntryMemory() {
         out->type = kInternalAllocated;
       }
     }
-    // resource
-    const std::vector<ResourceRequest>& reqs = GetResource(nid);
-    op_nodes_[nid].resources.resize(reqs.size());
-    for (uint32_t i = 0; i < reqs.size(); ++i) {
-      op_nodes_[nid].resources[i].resource.req = reqs[i];
-    }
-    // allocate resource
-    for (ResourceEntry& entry : op_nodes_[nid].resources) {
-      if (entry.resource.req.type == ResourceRequest::kTempSpace) {
-        entry.storage_id =
-            allocator.Request(op_nodes_[nid].ctx,
-                              mshadow::Shape1(entry.resource.req.space_num_reals),
-                              nid);
-      }
-    }
     // then free inputs
     for (DataEntryInfo *in : in_data) {
       // temp_ref_count == 0 means it is taken by inplace op
@@ -445,12 +430,6 @@ void GraphExecutor::InitDataEntryMemory() {
         allocator.Release(out->storage_id, nid);
       }
     }
-    // release the resource, as soon as the forward is finished we can release it.
-    for (ResourceEntry& res : op_nodes_[nid].resources) {
-      if (res.resource.req.type == ResourceRequest::kTempSpace) {
-        allocator.Release(res.storage_id, nid);
-      }
-    }
   }
   // one pass complete, allocate real memory
   this->total_allocated_reals_ = allocator.InitStorages();
@@ -464,25 +443,8 @@ void GraphExecutor::InitDataEntryMemory() {
         out.data = allocator.Get(out.storage_id, out.shape);
       }
     }
-    // Get the resource of temporal space.
-    for (ResourceEntry& entry : op_nodes_[nid].resources) {
-      if (entry.resource.req.type == ResourceRequest::kTempSpace) {
-        entry.data = allocator.Get(entry.storage_id,
-                                   mshadow::Shape1(entry.resource.req.space_num_reals));
-        entry.resource.ptr_ = entry.data.data().dptr_;
-        entry.resource.var = entry.data.var();
-      } else if (entry.resource.req.type == ResourceRequest::kRandom) {
-        entry.resource = ResourceManager::Get()->Request(
-            op_nodes_[nid].ctx, entry.resource.req);
-      } else {
-        LOG(FATAL) << "resource type not yet supported";
-      }
-      op_nodes_[nid].op_ctx.requested.resize(op_nodes_[nid].resources.size());
-      for (size_t i = 0; i < op_nodes_[nid].resources.size(); ++i) {
-        op_nodes_[nid].op_ctx.requested[i] = op_nodes_[nid].resources[i].resource;
-      }
-    }
   }
+  // setup heads
   for (StaticGraph::DataEntry e : graph_.heads) {
     DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
     CHECK_EQ(info.type, kInternalAllocated);
@@ -490,6 +452,32 @@ void GraphExecutor::InitDataEntryMemory() {
   }
 }
 
+void GraphExecutor::InitResources() {
+  // Resource allocation
+  for (size_t i = 0; i < topo_order_.size(); ++i) {
+    uint32_t nid = topo_order_[i];
+    if (!op_nodes_[nid].activated) continue;
+    if (graph_.nodes[nid].is_variable()) continue;
+
+    const std::vector<ResourceRequest>& reqs = GetResource(nid);
+    auto& requested = op_nodes_[nid].op_ctx.requested;
+    requested.clear();
+    // Get the resource of temporary space.
+    for (const ResourceRequest& req : reqs) {
+      if (req.type == ResourceRequest::kTempSpace) {
+        // TODO(tqchen, bing) smarter graph-aware temp space allocation.
+        requested.push_back(ResourceManager::Get()->Request(
+            op_nodes_[nid].ctx, req));
+      } else if (req.type == ResourceRequest::kRandom) {
+        requested.push_back(ResourceManager::Get()->Request(
+            op_nodes_[nid].ctx, req));
+      } else {
+        LOG(FATAL) << "resource type not yet supported";
+      }
+    }
+  }
+}
+
 void GraphExecutor::InitOpNodes() {
   for (size_t i = 0; i < topo_order_.size(); ++i) {
     uint32_t nid = topo_order_[i];
@@ -544,8 +532,7 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) {
   }
 }
 
-std::string GraphExecutor::DebugStr() const {
-  std::ostringstream os;
+void GraphExecutor::Print(std::ostream &os) const {
   os << "num_forward_nodes=" << num_forward_nodes_ << '\n';
   for (size_t i = 0; i < topo_order_.size(); ++i) {
     uint32_t nid = topo_order_[i];
@@ -562,20 +549,18 @@ std::string GraphExecutor::DebugStr() const {
       }
       os << '\n';
     }
-    for (size_t j = 0; j < op_nodes_[nid].resources.size(); ++j) {
-      const ResourceEntry &entry = op_nodes_[nid].resources[j];
+    for (size_t j = 0; j < op_nodes_[nid].op_ctx.requested.size(); ++j) {
+      const Resource& resource = op_nodes_[nid].op_ctx.requested[j];
       os << "\tresource[" << j << "]: ";
-      if (entry.resource.req.type == ResourceRequest::kTempSpace) {
-        os << "type=TempSpace, size=" << entry.resource.req.space_num_reals
-           << ", storage_id=" << entry.storage_id;
-      } else if (entry.resource.req.type == ResourceRequest::kRandom) {
+      if (resource.req.type == ResourceRequest::kTempSpace) {
+        os << "type=TempSpace, id=" << resource.id;
+      } else if (resource.req.type == ResourceRequest::kRandom) {
         os << "type=RandomNumber";
      }
       os << '\n';
     }
   }
   os << "Total " << (total_allocated_reals_ >> 18UL) << " MB allocated\n";
-  return os.str();
 }
 
 void GraphExecutor::Forward(bool is_train) {
diff --git a/src/symbol/graph_executor.h b/src/symbol/graph_executor.h
index a2aafa798669..2f32e34dc31f 100644
--- a/src/symbol/graph_executor.h
+++ b/src/symbol/graph_executor.h
@@ -20,11 +20,12 @@ namespace mxnet {
 class GraphExecutor : public Executor {
  public:
   virtual ~GraphExecutor();
-  virtual void Forward(bool is_train);
-  virtual void Backward(const std::vector<NDArray> &head_grads);
-  virtual const std::vector<NDArray> &outputs() const {
+  void Forward(bool is_train) override;
+  void Backward(const std::vector<NDArray> &head_grads) override;
+  const std::vector<NDArray> &outputs() const override {
     return heads_ndarray_;
   }
+  void Print(std::ostream &os) const override;  // NOLINT(*)
   // implement Executor::Bind, only call it once.
   inline void Init(Symbol symbol,
                    Context ctx,
@@ -42,11 +43,8 @@ class GraphExecutor : public Executor {
     this->InitGraph(symbol, ctx, need_backward);
     this->InitDataEntryInfo(in_args, arg_grad_store, grad_req_type, aux_states);
     this->InitDataEntryMemory();
+    this->InitResources();
     this->InitOpNodes();
-    // TODO(bing): remove me when things are OK
-    // LOG(INFO) << "-----Execution memory plan-----\n"
-    //           << DebugStr() << '\n'
-    //           << "------------------------------\n";
   }
 
  protected:
@@ -92,15 +90,6 @@ class GraphExecutor : public Executor {
         storage_id(GraphStorageAllocator::kBadStorageID),
         temp_ref_count(0), ref_count(0) {}
   };
-  // information of the resource
-  struct ResourceEntry {
-    /*! \brief the actual resource */
-    Resource resource;
-    /*! \brief actual data for the entry if it is a temp space */
-    NDArray data;
-    // storage id from allocator if it is a temp space
-    GraphStorageAllocator::StorageID storage_id;
-  };
   // all the information needed to push the op to engine
   struct OpExecEntry {
     // execution function for
@@ -122,8 +111,6 @@ class GraphExecutor : public Executor {
     std::vector<DataEntryInfo> outputs;
     // auxiliary data information of op
     std::vector<DataEntryInfo> aux_states;
-    // resource entry
-    std::vector<ResourceEntry> resources;
     // The following parts are constructed in InitOpNodes
     // the real operator
     std::shared_ptr<Operator> op;
@@ -180,12 +167,12 @@ class GraphExecutor : public Executor {
       const std::vector<NDArray> &aux_states);
   // initialize internal data entries NDArray
   void InitDataEntryMemory();
+  // initialize the internal resources for each op
+  void InitResources();
   // initialize OpNode data structure
   void InitOpNodes();
   // run ops from topo order start to end
   void RunOps(bool is_train, size_t topo_start, size_t topo_end);
-  // get debug string
-  std::string DebugStr() const;
   // internal computational graph
   StaticGraph graph_;
   // topological order of nodes in computation graph
diff --git a/tests/python/train/test_conv.py b/tests/python/train/test_conv.py
index d9f737402e7a..f5f19982dd9f 100644
--- a/tests/python/train/test_conv.py
+++ b/tests/python/train/test_conv.py
@@ -50,9 +50,10 @@ def CalAcc(out, label):
 
 # bind executor
 # TODO(bing): think of a better bind interface
-executor = softmax.bind(mx.Context('cpu'), arg_narrays, grad_narrays, 'write', aux_narrays)
-# update
+executor = softmax.bind(mx.cpu(), arg_narrays, grad_narrays, 'write', aux_narrays)
+# update
+print(executor.debug_str())
 
 out_narray = executor.outputs[0]
 grad_narray = mx.nd.empty(out_narray.shape)