change temp space allocation to dynamic size
tqchen committed Sep 16, 2015
1 parent bc8ffb2 commit d91a5fc
Showing 12 changed files with 184 additions and 105 deletions.
7 changes: 7 additions & 0 deletions include/mxnet/c_api.h
@@ -483,6 +483,13 @@ MXNET_DLL int MXSymbolInferShape(SymbolHandle sym,
 //--------------------------------------------
 // Part 4: Executor interface
 //--------------------------------------------
+/*!
+ * \brief Print the content of execution plan, used for debug.
+ * \param handle the executor.
+ * \param out_str pointer to hold the output string of the printing.
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXExecutorPrint(ExecutorHandle handle, const char **out_str);
 /*!
  * \brief Executor forward method
  *
22 changes: 13 additions & 9 deletions include/mxnet/resource.h
@@ -20,21 +20,19 @@ struct ResourceRequest {
   enum Type {
     /*! \brief mshadow::Random<xpu> object */
     kRandom,
-    /*! \brief Temporal space */
+    /*! \brief A dynamic temp space that can be arbitrary size */
     kTempSpace
   };
   /*! \brief type of resources */
   Type type;
-  /*! \brief size of space requested, in terms of number of reals */
-  size_t space_num_reals;
   /*! \brief default constructor */
   ResourceRequest() {}
   /*!
    * \brief constructor, allow implicit conversion
    * \param type type of resources
    */
-  ResourceRequest(Type type, size_t space_num_reals = 0) // NOLINT(*)
-      : type(type), space_num_reals(space_num_reals) {}
+  ResourceRequest(Type type) // NOLINT(*)
+      : type(type) {}
 };

@@ -48,11 +46,15 @@ struct Resource {
   ResourceRequest req;
   /*! \brief engine variable */
   engine::VarHandle var;
+  /*! \brief identifier of id information, used for debug purpose */
+  int32_t id;
   /*!
    * \brief pointer to the resource, do not use directly,
    *  access using member functions
    */
   void *ptr_;
+  /*! \brief default constructor */
+  Resource() : id(0) {}
   /*!
    * \brief Get random number generator.
    * \param The stream to use in the random number generator.
@@ -70,7 +72,8 @@ struct Resource {
   }
   /*!
    * \brief Get space requested as mshadow Tensor.
-   *  The resulting tensor must fit in space requsted.
+   *  The caller can request arbitrary size.
+   *
    * \param shape the Shape of returning tensor.
    * \param stream the stream of returning tensor.
    * \return the mshadow tensor requested.
@@ -81,9 +84,11 @@ struct Resource {
   inline mshadow::Tensor<xpu, ndim, real_t> get_space(
       mshadow::Shape<ndim> shape, mshadow::Stream<xpu> *stream) const {
     CHECK_EQ(req.type, ResourceRequest::kTempSpace);
-    CHECK_GE(req.space_num_reals, shape.Size());
+    mshadow::TensorContainer<xpu, 1, real_t> *space =
+        static_cast<mshadow::TensorContainer<xpu, 1, real_t>*>(ptr_);
+    space->Resize(mshadow::Shape1(shape.Size()));
     return mshadow::Tensor<xpu, ndim, real_t>(
-        static_cast<real_t*>(ptr_), shape, shape[ndim - 1], stream);
+        space->dptr_, shape, shape[ndim - 1], stream);
   }
 };

@@ -97,7 +102,6 @@ class ResourceManager {
    * \return the requested resource.
    * \note The returned resource's ownership is
    *       still held by the manager singleton.
-   *
    */
   virtual Resource Request(Context ctx, const ResourceRequest &req) = 0;
   /*!
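Editor's note (not part of the commit): with this change an operator no longer pre-declares a byte count; it returns a bare kTempSpace request and sizes the space at execution time, as the batch_norm and convolution hunks below show on the request side. A minimal, hypothetical sketch of the new get_space() usage, assuming mshadow is set up and `temp` is the Resource granted for a kTempSpace request:

// Sketch only: a made-up operator fragment using the dynamic temp space.
#include <mxnet/resource.h>

template<typename xpu>
void UseWorkspace(const mxnet::Resource &temp, mshadow::Stream<xpu> *s) {
  // Any shape may be requested; get_space() resizes the backing
  // TensorContainer instead of checking a pre-declared size.
  mshadow::Tensor<xpu, 2, mxnet::real_t> workspace =
      temp.get_space<xpu>(mshadow::Shape2(128, 256), s);
  // ... fill and consume `workspace` within this operator call ...
}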
5 changes: 5 additions & 0 deletions include/mxnet/symbolic.h
@@ -400,6 +400,11 @@ class Executor {
    * \param head_grads the gradient of head nodes to be backproped.
    */
  virtual void Backward(const std::vector<NDArray> &head_grads) = 0;
+  /*!
+   * \brief print the execution plan info to output stream.
+   * \param os the output stream we would like to print to.
+   */
+  virtual void Print(std::ostream &os) const {} // NOLINT(*)
   /*!
    * \brief get array of outputs in the executor.
    * \return array of outputs in the executor.
17 changes: 15 additions & 2 deletions python/mxnet/executor.py
@@ -5,8 +5,8 @@

 import ctypes
 from .base import _LIB
-from .base import c_array, mx_uint, NDArrayHandle, ExecutorHandle
-from .base import check_call
+from .base import mx_uint, NDArrayHandle, ExecutorHandle
+from .base import check_call, c_array, py_str
 from .ndarray import NDArray

 class Executor(object):

@@ -81,6 +81,19 @@ def backward(self, head_grads=None):
         ndarray = c_array(NDArrayHandle, [item.handle for item in head_grads])
         check_call(_LIB.MXExecutorBackward(self.handle, len(head_grads), ndarray))

+    def debug_str(self):
+        """Get a debug string about internal execution plan.
+
+        Returns
+        -------
+        debug_str : string
+            Debug string of the executor.
+        """
+        debug_str = ctypes.c_char_p()
+        check_call(_LIB.MXExecutorPrint(
+            self.handle, ctypes.byref(debug_str)))
+        return py_str(debug_str.value)
+
     @property
     def outputs(self):
         """list all heads' output ndarray
11 changes: 11 additions & 0 deletions src/c_api.cc
@@ -684,6 +684,17 @@ int MXSymbolInferShape(SymbolHandle sym,
   API_END();
 }

+int MXExecutorPrint(ExecutorHandle handle, const char **out_str) {
+  Executor *exec = static_cast<Executor*>(handle);
+  MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
+  API_BEGIN();
+  std::ostringstream os;
+  exec->Print(os);
+  ret->ret_str = os.str();
+  *out_str = (ret->ret_str).c_str();
+  API_END();
+}
+
 int MXExecutorForward(ExecutorHandle handle, bool is_train) {
   API_BEGIN();
   Executor *exec = static_cast<Executor*>(handle);
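Editor's note (not part of the commit): the Python debug_str() method above is a thin ctypes wrapper over this entry point. A sketch of a direct C++ caller follows; `exec_handle` is assumed to come from a prior MXExecutorBind call. Because the returned pointer aims at library-owned thread-local storage (ret->ret_str above), the string should be copied before the next C API call on the same thread.

#include <mxnet/c_api.h>
#include <iostream>
#include <string>

// Sketch: dump an executor's execution plan through the new C API.
void PrintPlan(ExecutorHandle exec_handle) {
  const char *out = nullptr;
  if (MXExecutorPrint(exec_handle, &out) == 0) {  // 0 means success
    std::string plan(out);  // copy out of the thread-local buffer
    std::cout << plan << std::endl;
  }
}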
3 changes: 2 additions & 1 deletion src/engine/stream_manager.h
@@ -75,8 +75,9 @@ template <std::size_t kNumGpus, std::size_t kStreams>
 RunContext StreamManager<kNumGpus, kStreams>::GetIORunContext(
     Context const& ctx) {
   RunContext ret;
+  ret.stream = nullptr;
   switch (ctx.dev_mask) {
-    case cpu::kDevMask: ret.stream = nullptr; break;
+    case cpu::kDevMask: break;
     case gpu::kDevMask: {
 #if MXNET_USE_CUDA
       CUDA_CALL(cudaSetDevice(ctx.dev_id));
4 changes: 1 addition & 3 deletions src/operator/batch_norm-inl.h
@@ -238,9 +238,7 @@ class BatchNormProp : public OperatorProperty {

   std::vector<ResourceRequest> BackwardResource(
       const std::vector<TShape> &in_shape) const override {
-    const TShape &dshape = in_shape[0];
-    size_t nspace = dshape[1] * 3;
-    return {{ResourceRequest::kTempSpace, nspace}};
+    return {ResourceRequest::kTempSpace};
   }

   int NumVisibleOutputs() const override {
4 changes: 2 additions & 2 deletions src/operator/convolution-inl.h
@@ -348,12 +348,12 @@ class ConvolutionProp : public OperatorProperty {

   virtual std::vector<ResourceRequest> ForwardResource(
       const std::vector<TShape> &in_shape) const {
-    return {{ResourceRequest::kTempSpace, param_.workspace}};
+    return {ResourceRequest::kTempSpace};
   }

   virtual std::vector<ResourceRequest> BackwardResource(
       const std::vector<TShape> &in_shape) const {
-    return {{ResourceRequest::kTempSpace, param_.workspace}};
+    return {ResourceRequest::kTempSpace};
   }

   Operator* CreateOperator(Context ctx) const;
103 changes: 85 additions & 18 deletions src/resource.cc
@@ -4,9 +4,12 @@
  * \brief Implementation of resource manager.
  */
 #include <dmlc/logging.h>
+#include <dmlc/parameter.h>
 #include <mxnet/base.h>
 #include <mxnet/engine.h>
 #include <mxnet/resource.h>
+#include <limits>
+#include <atomic>
 #include "./common/lazy_alloc_array.h"

 namespace mxnet {

@@ -15,10 +18,15 @@ namespace resource {
 // implements resource manager
 class ResourceManagerImpl : public ResourceManager {
  public:
-  ResourceManagerImpl() : global_seed_(0) {
+  ResourceManagerImpl() noexcept(false)
+      : global_seed_(0) {
+    cpu_temp_space_copy_ = dmlc::GetEnv("MXNET_CPU_TEMP_COPY", 16);
+    gpu_temp_space_copy_ = dmlc::GetEnv("MXNET_GPU_TEMP_COPY", 4);
     engine_ref_ = Engine::_GetSharedRef();
     cpu_rand_ = new ResourceRandom<cpu>(
         Context(cpu::kDevMask, 0), global_seed_);
+    cpu_space_ = new ResourceTempSpace<cpu>(
+        Context(cpu::kDevMask, 0), cpu_temp_space_copy_);
   }
   ~ResourceManagerImpl() {
     // need explicit delete, before engine get killed

@@ -32,21 +40,31 @@ class ResourceManagerImpl : public ResourceManager {

   // request resources
   Resource Request(Context ctx, const ResourceRequest &req) override {
-    if (req.type == ResourceRequest::kRandom) {
-      if (ctx.dev_mask == cpu::kDevMask) {
-        return cpu_rand_->resource;
-      } else {
-        CHECK_EQ(ctx.dev_mask, gpu::kDevMask);
+    if (ctx.dev_mask == cpu::kDevMask) {
+      switch (req.type) {
+        case ResourceRequest::kRandom: return cpu_rand_->resource;
+        case ResourceRequest::kTempSpace: return cpu_space_->GetNext();
+        default: LOG(FATAL) << "Unknown supported type " << req.type;
+      }
+    } else {
+      CHECK_EQ(ctx.dev_mask, gpu::kDevMask);
 #if MSHADOW_USE_CUDA
-        return gpu_rand_.Get(ctx.dev_id, [ctx, this]() {
-            return new ResourceRandom<gpu>(ctx, global_seed_);
-          })->resource;
+      switch (req.type) {
+        case ResourceRequest::kRandom: {
+          return gpu_rand_.Get(ctx.dev_id, [ctx, this]() {
+              return new ResourceRandom<gpu>(ctx, global_seed_);
+            })->resource;
+        }
+        case ResourceRequest::kTempSpace: {
+          return gpu_space_.Get(ctx.dev_id, [ctx, this]() {
+              return new ResourceTempSpace<gpu>(ctx, gpu_temp_space_copy_);
+            })->GetNext();
+        }
+        default: LOG(FATAL) << "Unknown supported type " << req.type;
+      }
 #else
-        LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+      LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
 #endif
     }
-    } else {
-      LOG(FATAL) << "Unknown supported type " << req.type;
-    }
     Resource ret;
     return ret;

@@ -67,16 +85,13 @@ class ResourceManagerImpl : public ResourceManager {
   static constexpr std::size_t kMaxNumGPUs = 16;
   /*! \brief Random number magic number to seed different random numbers */
   static constexpr uint32_t kRandMagic = 127UL;
-  /*! \brief Reference to the engine */
-  std::shared_ptr<Engine> engine_ref_;
-
   // the random number resources
   template<typename xpu>
   struct ResourceRandom {
-    /*! \brief pointer to PRNG */
-    mshadow::Random<xpu> *prnd;
     /*! \brief the context of the PRNG */
     Context ctx;
+    /*! \brief pointer to PRNG */
+    mshadow::Random<xpu> *prnd;
     /*! \brief resource representation */
     Resource resource;
     /*! \brief constructor */

@@ -103,13 +118,65 @@ class ResourceManagerImpl : public ResourceManager {
         }, ctx, {}, {resource.var});
     }
   };
+  // temporary space resource.
+  template<typename xpu>
+  struct ResourceTempSpace {
+    /*! \brief the context of the device */
+    Context ctx;
+    /*! \brief the underlying space */
+    std::vector<mshadow::TensorContainer<xpu, 1, real_t>*> space;
+    /*! \brief resource representation */
+    std::vector<Resource> resource;
+    /*! \brief current pointer to the round robin allocator */
+    std::atomic<size_t> curr_ptr;
+    /*! \brief constructor */
+    explicit ResourceTempSpace(Context ctx, size_t ncopy)
+        : ctx(ctx), space(ncopy), resource(ncopy), curr_ptr(0) {
+      mshadow::SetDevice<xpu>(ctx.dev_id);
+      for (size_t i = 0; i < space.size(); ++i) {
+        space[i] = new mshadow::TensorContainer<xpu, 1, real_t>();
+        resource[i].var = Engine::Get()->NewVariable();
+        resource[i].id = static_cast<int32_t>(i);
+        resource[i].ptr_ = space[i];
+        resource[i].req = ResourceRequest(ResourceRequest::kTempSpace);
+      }
+    }
+    ~ResourceTempSpace() {
+      for (size_t i = 0; i < space.size(); ++i) {
+        mshadow::TensorContainer<xpu, 1, real_t>* r = space[i];
+        Engine::Get()->DeleteVariable(
+            [r](RunContext rctx){ delete r; }, ctx, resource[i].var);
+      }
+    }
+    // get next resource in round robin manner
+    inline Resource GetNext() {
+      const size_t kMaxDigit = std::numeric_limits<size_t>::max() / 2;
+      size_t ptr = ++curr_ptr;
+      // reset ptr to avoid undefined behavior during overflow
+      // usually this won't happen
+      if (ptr > kMaxDigit) {
+        curr_ptr.store((ptr + 1) % space.size());
+      }
+      return resource[ptr % space.size()];
+    }
+  };
+  /*! \brief number of copies in CPU temp space */
+  int cpu_temp_space_copy_;
+  /*! \brief number of copies in GPU temp space */
+  int gpu_temp_space_copy_;
+  /*! \brief Reference to the engine */
+  std::shared_ptr<Engine> engine_ref_;
   /*! \brief internal seed to the random number generator */
   uint32_t global_seed_;
   /*! \brief CPU random number resources */
   ResourceRandom<cpu> *cpu_rand_;
+  /*! \brief CPU temp space resources */
+  ResourceTempSpace<cpu> *cpu_space_;
 #if MXNET_USE_CUDA
   /*! \brief random number generator for GPU */
   common::LazyAllocArray<ResourceRandom<gpu> > gpu_rand_;
+  /*! \brief temp space for GPU */
+  common::LazyAllocArray<ResourceTempSpace<gpu> > gpu_space_;
 #endif
 };
 } // namespace resource
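Editor's note (not part of the commit): each of the ncopy temp-space copies owns its own engine variable, so operators that land on different copies can run concurrently instead of serializing on a single buffer's write dependency; MXNET_CPU_TEMP_COPY and MXNET_GPU_TEMP_COPY trade memory for that parallelism. A standalone sketch of the slot rotation GetNext() performs, with kCopies standing in for space.size():

#include <atomic>
#include <cstddef>
#include <limits>

constexpr std::size_t kCopies = 4;      // e.g. the GPU default above
std::atomic<std::size_t> curr_ptr(0);

std::size_t NextSlot() {
  std::size_t ptr = ++curr_ptr;         // one atomic bump per request
  // Reset long before the unsigned counter wraps; (ptr + 1) % kCopies
  // keeps the rotation phase for the next caller.
  if (ptr > std::numeric_limits<std::size_t>::max() / 2) {
    curr_ptr.store((ptr + 1) % kCopies);
  }
  return ptr % kCopies;                 // index of the copy to hand out
}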
(Diffs for the remaining 3 changed files were not loaded.)
