From d91a5fc316e450d0d300c00f57ec8da4bc45b3d8 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 16 Sep 2015 12:01:32 -0700 Subject: [PATCH] change temp space allocation to dynamic size --- include/mxnet/c_api.h | 7 +++ include/mxnet/resource.h | 22 ++++--- include/mxnet/symbolic.h | 5 ++ python/mxnet/executor.py | 17 +++++- src/c_api.cc | 11 ++++ src/engine/stream_manager.h | 3 +- src/operator/batch_norm-inl.h | 4 +- src/operator/convolution-inl.h | 4 +- src/resource.cc | 103 ++++++++++++++++++++++++++------ src/symbol/graph_executor.cc | 81 ++++++++++--------------- src/symbol/graph_executor.h | 27 +++------ tests/python/train/test_conv.py | 5 +- 12 files changed, 184 insertions(+), 105 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index f6122c1f37b6..c954cabadac0 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -483,6 +483,13 @@ MXNET_DLL int MXSymbolInferShape(SymbolHandle sym, //-------------------------------------------- // Part 4: Executor interface //-------------------------------------------- +/*! + * \brief Print the content of execution plan, used for debug. + * \param handle the executor. + * \param out_str pointer to hold the output string of the printing. + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXExecutorPrint(ExecutorHandle symbol, const char **out_str); /*! * \brief Executor forward method * diff --git a/include/mxnet/resource.h b/include/mxnet/resource.h index a6c61f6f8862..d85299960511 100644 --- a/include/mxnet/resource.h +++ b/include/mxnet/resource.h @@ -20,21 +20,19 @@ struct ResourceRequest { enum Type { /*! \brief mshadow::Random object */ kRandom, - /*! \brief Temporal space */ + /*! \brief A dynamic temp space that can be arbitrary size */ kTempSpace }; /*! \brief type of resources */ Type type; - /*! \brief size of space requested, in terms of number of reals */ - size_t space_num_reals; /*! \brief default constructor */ ResourceRequest() {} /*! * \brief constructor, allow implicit conversion * \param type type of resources */ - ResourceRequest(Type type, size_t space_num_reals = 0) // NOLINT(*) - : type(type), space_num_reals(space_num_reals) {} + ResourceRequest(Type type) // NOLINT(*) + : type(type) {} }; @@ -48,11 +46,15 @@ struct Resource { ResourceRequest req; /*! \brief engine variable */ engine::VarHandle var; + /*! \brief identifier of id information, used for debug purpose */ + int32_t id; /*! * \brief pointer to the resource, do not use directly, * access using member functions */ void *ptr_; + /*! \brief default constructor */ + Resource() : id(0) {} /*! * \brief Get random number generator. * \param The stream to use in the random number generator. @@ -70,7 +72,8 @@ struct Resource { } /*! * \brief Get space requested as mshadow Tensor. - * The resulting tensor must fit in space requsted. + * The caller can request arbitrary size. + * * \param shape the Shape of returning tensor. * \param stream the stream of retruning tensor. * \return the mshadow tensor requested. @@ -81,9 +84,11 @@ struct Resource { inline mshadow::Tensor get_space( mshadow::Shape shape, mshadow::Stream *stream) const { CHECK_EQ(req.type, ResourceRequest::kTempSpace); - CHECK_GE(req.space_num_reals, shape.Size()); + mshadow::TensorContainer *space = + static_cast*>(ptr_); + space->Resize(mshadow::Shape1(shape.Size())); return mshadow::Tensor( - static_cast(ptr_), shape, shape[ndim - 1], stream); + space->dptr_, shape, shape[ndim - 1], stream); } }; @@ -97,7 +102,6 @@ class ResourceManager { * \return the requested resource. * \note The returned resource's ownership is * still hold by the manager singleton. - * */ virtual Resource Request(Context ctx, const ResourceRequest &req) = 0; /*! diff --git a/include/mxnet/symbolic.h b/include/mxnet/symbolic.h index 9f1f21d69d59..ad01040007a7 100644 --- a/include/mxnet/symbolic.h +++ b/include/mxnet/symbolic.h @@ -400,6 +400,11 @@ class Executor { * \param head_grads the gradient of head nodes to be backproped. */ virtual void Backward(const std::vector &head_grads) = 0; + /*! + * \brief print the execution plan info to output stream. + * \param os the output stream we like to print to. + */ + virtual void Print(std::ostream &os) const {} // NOLINT(*) /*! * \brief get array of outputs in the executor. * \return array of outputs in the executor. diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py index f57077adc919..27511f5ff38e 100644 --- a/python/mxnet/executor.py +++ b/python/mxnet/executor.py @@ -5,8 +5,8 @@ import ctypes from .base import _LIB -from .base import c_array, mx_uint, NDArrayHandle, ExecutorHandle -from .base import check_call +from .base import mx_uint, NDArrayHandle, ExecutorHandle +from .base import check_call, c_array, py_str from .ndarray import NDArray class Executor(object): @@ -81,6 +81,19 @@ def backward(self, head_grads=None): ndarray = c_array(NDArrayHandle, [item.handle for item in head_grads]) check_call(_LIB.MXExecutorBackward(self.handle, len(head_grads), ndarray)) + def debug_str(self): + """Get a debug string about internal execution plan. + + Returns + ------- + debug_str : string + Debug string of the executor. + """ + debug_str = ctypes.c_char_p() + check_call(_LIB.MXExecutorPrint( + self.handle, ctypes.byref(debug_str))) + return py_str(debug_str.value) + @property def outputs(self): """list all heads' output ndarray diff --git a/src/c_api.cc b/src/c_api.cc index 222440f76855..c453f743aba3 100644 --- a/src/c_api.cc +++ b/src/c_api.cc @@ -684,6 +684,17 @@ int MXSymbolInferShape(SymbolHandle sym, API_END(); } +int MXExecutorPrint(ExecutorHandle handle, const char **out_str) { + Executor *exec = static_cast(handle); + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); + API_BEGIN(); + std::ostringstream os; + exec->Print(os); + ret->ret_str = os.str(); + *out_str = (ret->ret_str).c_str(); + API_END(); +} + int MXExecutorForward(ExecutorHandle handle, bool is_train) { API_BEGIN(); Executor *exec = static_cast(handle); diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h index adc84cdf7e9e..b9303e86f08c 100644 --- a/src/engine/stream_manager.h +++ b/src/engine/stream_manager.h @@ -75,8 +75,9 @@ template RunContext StreamManager::GetIORunContext( Context const& ctx) { RunContext ret; + ret.stream = nullptr; switch (ctx.dev_mask) { - case cpu::kDevMask: ret.stream = nullptr; break; + case cpu::kDevMask: break; case gpu::kDevMask: { #if MXNET_USE_CUDA CUDA_CALL(cudaSetDevice(ctx.dev_id)); diff --git a/src/operator/batch_norm-inl.h b/src/operator/batch_norm-inl.h index 9ab15d4abe7a..c6bca12bd0db 100644 --- a/src/operator/batch_norm-inl.h +++ b/src/operator/batch_norm-inl.h @@ -238,9 +238,7 @@ class BatchNormProp : public OperatorProperty { std::vector BackwardResource( const std::vector &in_shape) const override { - const TShape &dshape = in_shape[0]; - size_t nspace = dshape[1] * 3; - return {{ResourceRequest::kTempSpace, nspace}}; + return {ResourceRequest::kTempSpace}; } int NumVisibleOutputs() const override { diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h index 4ce017270a68..f26265f3f3f1 100644 --- a/src/operator/convolution-inl.h +++ b/src/operator/convolution-inl.h @@ -348,12 +348,12 @@ class ConvolutionProp : public OperatorProperty { virtual std::vector ForwardResource( const std::vector &in_shape) const { - return {{ResourceRequest::kTempSpace, param_.workspace}}; + return {ResourceRequest::kTempSpace}; } virtual std::vector BackwardResource( const std::vector &in_shape) const { - return {{ResourceRequest::kTempSpace, param_.workspace}}; + return {ResourceRequest::kTempSpace}; } Operator* CreateOperator(Context ctx) const; diff --git a/src/resource.cc b/src/resource.cc index 62d0b0080fc3..e3d1771f6d32 100644 --- a/src/resource.cc +++ b/src/resource.cc @@ -4,9 +4,12 @@ * \brief Implementation of resource manager. */ #include +#include #include #include #include +#include +#include #include "./common/lazy_alloc_array.h" namespace mxnet { @@ -15,10 +18,15 @@ namespace resource { // implements resource manager class ResourceManagerImpl : public ResourceManager { public: - ResourceManagerImpl() : global_seed_(0) { + ResourceManagerImpl() noexcept(false) + : global_seed_(0) { + cpu_temp_space_copy_ = dmlc::GetEnv("MXNET_CPU_TEMP_COPY", 16); + gpu_temp_space_copy_ = dmlc::GetEnv("MXNET_GPU_TEMP_COPY", 4); engine_ref_ = Engine::_GetSharedRef(); cpu_rand_ = new ResourceRandom( Context(cpu::kDevMask, 0), global_seed_); + cpu_space_ = new ResourceTempSpace( + Context(cpu::kDevMask, 0), cpu_temp_space_copy_); } ~ResourceManagerImpl() { // need explicit delete, before engine get killed @@ -32,21 +40,31 @@ class ResourceManagerImpl : public ResourceManager { // request resources Resource Request(Context ctx, const ResourceRequest &req) override { - if (req.type == ResourceRequest::kRandom) { - if (ctx.dev_mask == cpu::kDevMask) { - return cpu_rand_->resource; - } else { - CHECK_EQ(ctx.dev_mask, gpu::kDevMask); + if (ctx.dev_mask == cpu::kDevMask) { + switch (req.type) { + case ResourceRequest::kRandom: return cpu_rand_->resource; + case ResourceRequest::kTempSpace: return cpu_space_->GetNext(); + default: LOG(FATAL) << "Unknown supported type " << req.type; + } + } else { + CHECK_EQ(ctx.dev_mask, gpu::kDevMask); #if MSHADOW_USE_CUDA - return gpu_rand_.Get(ctx.dev_id, [ctx, this]() { - return new ResourceRandom(ctx, global_seed_); - })->resource; + switch (req.type) { + case ResourceRequest::kRandom: { + return gpu_rand_.Get(ctx.dev_id, [ctx, this]() { + return new ResourceRandom(ctx, global_seed_); + })->resource; + } + case ResourceRequest::kTempSpace: { + return gpu_space_.Get(ctx.dev_id, [ctx, this]() { + return new ResourceTempSpace(ctx, gpu_temp_space_copy_); + })->GetNext(); + } + default: LOG(FATAL) << "Unknown supported type " << req.type; + } #else - LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; #endif - } - } else { - LOG(FATAL) << "Unknown supported type " << req.type; } Resource ret; return ret; @@ -67,16 +85,13 @@ class ResourceManagerImpl : public ResourceManager { static constexpr std::size_t kMaxNumGPUs = 16; /*! \brief Random number magic number to seed different random numbers */ static constexpr uint32_t kRandMagic = 127UL; - /*! \brief Reference to the engine */ - std::shared_ptr engine_ref_; - // the random number resources template struct ResourceRandom { - /*! \brief pointer to PRNG */ - mshadow::Random *prnd; /*! \brief the context of the PRNG */ Context ctx; + /*! \brief pointer to PRNG */ + mshadow::Random *prnd; /*! \brief resource representation */ Resource resource; /*! \brief constructor */ @@ -103,13 +118,65 @@ class ResourceManagerImpl : public ResourceManager { }, ctx, {}, {resource.var}); } }; + // temporal space resource. + template + struct ResourceTempSpace { + /*! \brief the context of the device */ + Context ctx; + /*! \brief the underlying space */ + std::vector*> space; + /*! \brief resource representation */ + std::vector resource; + /*! \brief current pointer to the round roubin alloator */ + std::atomic curr_ptr; + /*! \brief constructor */ + explicit ResourceTempSpace(Context ctx, size_t ncopy) + : ctx(ctx), space(ncopy), resource(ncopy), curr_ptr(0) { + mshadow::SetDevice(ctx.dev_id); + for (size_t i = 0; i < space.size(); ++i) { + space[i] = new mshadow::TensorContainer(); + resource[i].var = Engine::Get()->NewVariable(); + resource[i].id = static_cast(i); + resource[i].ptr_ = space[i]; + resource[i].req = ResourceRequest(ResourceRequest::kTempSpace); + } + } + ~ResourceTempSpace() { + for (size_t i = 0; i < space.size(); ++i) { + mshadow::TensorContainer* r = space[i]; + Engine::Get()->DeleteVariable( + [r](RunContext rctx){ delete r; }, ctx, resource[i].var); + } + } + // get next resource in round roubin matter + inline Resource GetNext() { + const size_t kMaxDigit = std::numeric_limits::max() / 2; + size_t ptr = ++curr_ptr; + // reset ptr to avoid undefined behavior during overflow + // usually this won't happen + if (ptr > kMaxDigit) { + curr_ptr.store((ptr + 1) % space.size()); + } + return resource[ptr % space.size()]; + } + }; + /*! \brief number of copies in CPU temp space */ + int cpu_temp_space_copy_; + /*! \brief number of copies in GPU temp space */ + int gpu_temp_space_copy_; + /*! \brief Reference to the engine */ + std::shared_ptr engine_ref_; /*! \brief internal seed to the random number generator */ uint32_t global_seed_; /*! \brief CPU random number resources */ ResourceRandom *cpu_rand_; + /*! \brief CPU temp space resources */ + ResourceTempSpace *cpu_space_; #if MXNET_USE_CUDA /*! \brief random number generator for GPU */ common::LazyAllocArray > gpu_rand_; + /*! \brief temp space for GPU */ + common::LazyAllocArray > gpu_space_; #endif }; } // namespace resource diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc index a30a4c0d897c..82071da15425 100644 --- a/src/symbol/graph_executor.cc +++ b/src/symbol/graph_executor.cc @@ -411,21 +411,6 @@ void GraphExecutor::InitDataEntryMemory() { out->type = kInternalAllocated; } } - // resource - const std::vector& reqs = GetResource(nid); - op_nodes_[nid].resources.resize(reqs.size()); - for (uint32_t i = 0; i < reqs.size(); ++i) { - op_nodes_[nid].resources[i].resource.req = reqs[i]; - } - // allocate resource - for (ResourceEntry& entry : op_nodes_[nid].resources) { - if (entry.resource.req.type == ResourceRequest::kTempSpace) { - entry.storage_id = - allocator.Request(op_nodes_[nid].ctx, - mshadow::Shape1(entry.resource.req.space_num_reals), - nid); - } - } // then free inputs for (DataEntryInfo *in : in_data) { // temp_ref_count == 0 means it is taken by inplace op @@ -445,12 +430,6 @@ void GraphExecutor::InitDataEntryMemory() { allocator.Release(out->storage_id, nid); } } - // release the resource, as soon as the forward is finished we can release it. - for (ResourceEntry& res : op_nodes_[nid].resources) { - if (res.resource.req.type == ResourceRequest::kTempSpace) { - allocator.Release(res.storage_id, nid); - } - } } // one pass complete, allocate real memory this->total_allocated_reals_ = allocator.InitStorages(); @@ -464,25 +443,8 @@ void GraphExecutor::InitDataEntryMemory() { out.data = allocator.Get(out.storage_id, out.shape); } } - // Get the resource of temporal space. - for (ResourceEntry& entry : op_nodes_[nid].resources) { - if (entry.resource.req.type == ResourceRequest::kTempSpace) { - entry.data = allocator.Get(entry.storage_id, - mshadow::Shape1(entry.resource.req.space_num_reals)); - entry.resource.ptr_ = entry.data.data().dptr_; - entry.resource.var = entry.data.var(); - } else if (entry.resource.req.type == ResourceRequest::kRandom) { - entry.resource = ResourceManager::Get()->Request( - op_nodes_[nid].ctx, entry.resource.req); - } else { - LOG(FATAL) << "resource type not yet supported"; - } - op_nodes_[nid].op_ctx.requested.resize(op_nodes_[nid].resources.size()); - for (size_t i = 0; i < op_nodes_[nid].resources.size(); ++i) { - op_nodes_[nid].op_ctx.requested[i] = op_nodes_[nid].resources[i].resource; - } - } } + // setup heads for (StaticGraph::DataEntry e : graph_.heads) { DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index]; CHECK_EQ(info.type, kInternalAllocated); @@ -490,6 +452,32 @@ void GraphExecutor::InitDataEntryMemory() { } } +void GraphExecutor::InitResources() { + // Resource allocation + for (size_t i = 0; i < topo_order_.size(); ++i) { + uint32_t nid = topo_order_[i]; + if (!op_nodes_[nid].activated) continue; + if (graph_.nodes[nid].is_variable()) continue; + + const std::vector& reqs = GetResource(nid); + auto& requested = op_nodes_[nid].op_ctx.requested; + requested.clear(); + // Get the resource of temporal space. + for (const ResourceRequest& req : reqs) { + if (req.type == ResourceRequest::kTempSpace) { + // TODO(tqchen, bing) more smarter graph aware temp sapce allocation. + requested.push_back(ResourceManager::Get()->Request( + op_nodes_[nid].ctx, req)); + } else if (req.type == ResourceRequest::kRandom) { + requested.push_back(ResourceManager::Get()->Request( + op_nodes_[nid].ctx, req)); + } else { + LOG(FATAL) << "resource type not yet supported"; + } + } + } +} + void GraphExecutor::InitOpNodes() { for (size_t i = 0; i < topo_order_.size(); ++i) { uint32_t nid = topo_order_[i]; @@ -544,8 +532,7 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { } } -std::string GraphExecutor::DebugStr() const { - std::ostringstream os; +void GraphExecutor::Print(std::ostream &os) const { os << "num_forward_nodes=" << num_forward_nodes_ << '\n'; for (size_t i = 0; i < topo_order_.size(); ++i) { uint32_t nid = topo_order_[i]; @@ -562,20 +549,18 @@ std::string GraphExecutor::DebugStr() const { } os << '\n'; } - for (size_t j = 0; j < op_nodes_[nid].resources.size(); ++j) { - const ResourceEntry &entry = op_nodes_[nid].resources[j]; + for (size_t j = 0; j < op_nodes_[nid].op_ctx.requested.size(); ++j) { + const Resource& resource = op_nodes_[nid].op_ctx.requested[j]; os << "\tresource[" << j << "]: "; - if (entry.resource.req.type == ResourceRequest::kTempSpace) { - os << "type=TempSpace, size=" << entry.resource.req.space_num_reals - << ", storage_id=" << entry.storage_id; - } else if (entry.resource.req.type == ResourceRequest::kRandom) { + if (resource.req.type == ResourceRequest::kTempSpace) { + os << "type=TempSpace, id=" << resource.id; + } else if (resource.req.type == ResourceRequest::kRandom) { os << "type=RandomNumber"; } os << '\n'; } } os << "Total " << (total_allocated_reals_ >> 18UL) <<" MB allocated\n"; - return os.str(); } void GraphExecutor::Forward(bool is_train) { diff --git a/src/symbol/graph_executor.h b/src/symbol/graph_executor.h index a2aafa798669..2f32e34dc31f 100644 --- a/src/symbol/graph_executor.h +++ b/src/symbol/graph_executor.h @@ -20,11 +20,12 @@ namespace mxnet { class GraphExecutor : public Executor { public: virtual ~GraphExecutor(); - virtual void Forward(bool is_train); - virtual void Backward(const std::vector &head_grads); - virtual const std::vector &outputs() const { + void Forward(bool is_train) override; + void Backward(const std::vector &head_grads) override; + const std::vector &outputs() const override { return heads_ndarray_; } + void Print(std::ostream &os) const override; // NOLINT(*) // implement Executor::Bind, only call it once. inline void Init(Symbol symbol, Context ctx, @@ -42,11 +43,8 @@ class GraphExecutor : public Executor { this->InitGraph(symbol, ctx, need_backward); this->InitDataEntryInfo(in_args, arg_grad_store, grad_req_type, aux_states); this->InitDataEntryMemory(); + this->InitResources(); this->InitOpNodes(); - // TODO(bing): remove me when things are OK - // LOG(INFO) << "-----Execution memory plan-----\n" - // << DebugStr() << '\n' - // << "------------------------------\n"; } protected: @@ -92,15 +90,6 @@ class GraphExecutor : public Executor { storage_id(GraphStorageAllocator::kBadStorageID), temp_ref_count(0), ref_count(0) {} }; - // information of the resource - struct ResourceEntry { - /*! \brief the actual resource */ - Resource resource; - /*! \brief actual data for the entry if it is a temp space */ - NDArray data; - // storage id from allocator if it is a temp space - GraphStorageAllocator::StorageID storage_id; - }; // all the information needed to push the op to engine struct OpExecEntry { // execution function for @@ -122,8 +111,6 @@ class GraphExecutor : public Executor { std::vector outputs; // auxiliary data information of op std::vector aux_states; - // resource entry - std::vector resources; // The following parts are constructed in InitOpNodes // the real operator std::shared_ptr op; @@ -180,12 +167,12 @@ class GraphExecutor : public Executor { const std::vector &aux_states); // initialize internal data entries NDArray void InitDataEntryMemory(); + // initialize the internal resources for each op + void InitResources(); // initialize OpNode data structure void InitOpNodes(); // run ops from topo order start to end void RunOps(bool is_train, size_t topo_start, size_t topo_end); - // get debug string - std::string DebugStr() const; // internal computational graph StaticGraph graph_; // topological order of nodes in computation graph diff --git a/tests/python/train/test_conv.py b/tests/python/train/test_conv.py index d9f737402e7a..f5f19982dd9f 100644 --- a/tests/python/train/test_conv.py +++ b/tests/python/train/test_conv.py @@ -50,9 +50,10 @@ def CalAcc(out, label): # bind executer # TODO(bing): think of a better bind interface -executor = softmax.bind(mx.Context('cpu'), arg_narrays, grad_narrays, 'write', aux_narrays) -# update +executor = softmax.bind(mx.cpu(), arg_narrays, grad_narrays, 'write', aux_narrays) +# update +print executor.debug_str() out_narray = executor.outputs[0] grad_narray = mx.nd.empty(out_narray.shape)