Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

Permalink
Implement storage tagging, the first half of the memory profiler
Browse files Browse the repository at this point in the history
  • Loading branch information
ArmageddonKnight committed Mar 9, 2020
1 parent 48e9e2c commit d37d7ba
Show file tree
Hide file tree
Showing 46 changed files with 1,066 additions and 99 deletions.
14 changes: 14 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Format: Auto | Common | All | LIST(ARCH_AND_PTX ...)
option(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
option(USE_OPENCV "Build with OpenCV support" ON)
option(USE_OPENMP "Build with Openmp support" ON)
cmake_dependent_option(USE_NVML "Build with nvml support if found" ON "USE_CUDA" OFF)
cmake_dependent_option(USE_CUDNN "Build with cudnn support" ON "USE_CUDA" OFF) # one could set CUDNN_ROOT for search path
cmake_dependent_option(USE_NVTX "Build with nvtx support if found" ON "USE_CUDA" OFF)
cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON "NOT ARM" OFF)
Expand Down Expand Up @@ -630,6 +631,19 @@ if(USE_CUDA)
list(APPEND SOURCE ${CUDA})
add_definitions(-DMXNET_USE_CUDA=1)

# Optional NVML (NVIDIA Management Library) support. Only probed on UNIX
# because the bundled FindNVML module searches UNIX-style install locations
# (e.g. /usr/local/cuda).
if(UNIX)
if(USE_NVML)
# Resolved by cmake/Modules/FindNVML.cmake (added in this same commit).
find_package(NVML)
if(NVML_FOUND)
include_directories(${NVML_INCLUDE_DIRS})
list(APPEND mxnet_LINKER_LIBS ${NVML_LIBRARIES})
add_definitions(-DMXNET_USE_NVML=1)
else()
# Build proceeds without NVML; define the macro to 0 so C++ code can rely
# on MXNET_USE_NVML always being defined (see include/mxnet/libinfo.h).
add_definitions(-DMXNET_USE_NVML=0)
message(WARNING "Could not find NVML libraries")
endif()
endif()
endif()
if(USE_NCCL)
find_package(NCCL)
if(NCCL_FOUND)
Expand Down
7 changes: 7 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,12 @@ ifeq ($(USE_CUDA), 1)
# Make sure to add stubs as fallback in order to be able to build
# without full CUDA install (especially if run without nvidia-docker)
LDFLAGS += -L/usr/local/cuda/lib64/stubs
# Optional NVML support: link the NVIDIA Management Library and define
# MXNET_USE_NVML either way so the macro is always set for C++ code.
ifeq ($(USE_NVML), 1)
LDFLAGS += -lnvidia-ml
CFLAGS += -DMXNET_USE_NVML=1
else
CFLAGS += -DMXNET_USE_NVML=0
endif
ifeq ($(USE_NCCL), 1)
ifneq ($(USE_NCCL_PATH), NONE)
CFLAGS += -I$(USE_NCCL_PATH)/include
Expand All @@ -537,6 +543,7 @@ ifeq ($(USE_CUDA), 1)
CFLAGS += -DMXNET_USE_NCCL=0
endif
else
CFLAGS += -DMXNET_USE_NVML=0
CFLAGS += -DMXNET_USE_NCCL=0
endif

Expand Down
8 changes: 4 additions & 4 deletions ci/docker/runtime_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,7 @@ build_ubuntu_gpu_mkldnn() {
set -ex
cd /work/build
cmake \
-DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-DCMAKE_BUILD_TYPE=Release \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_TVM_OP=ON \
-DUSE_CUDA=ON \
Expand All @@ -729,7 +729,7 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
set -ex
cd /work/build
cmake \
-DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-DCMAKE_BUILD_TYPE=Release \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_TVM_OP=ON \
-DUSE_CUDA=ON \
Expand All @@ -744,7 +744,7 @@ build_ubuntu_gpu_cuda101_cudnn7() {
set -ex
cd /work/build
cmake \
-DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-DCMAKE_BUILD_TYPE=Release \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_TVM_OP=ON \
-DUSE_CUDA=ON \
Expand Down Expand Up @@ -800,7 +800,7 @@ build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op() {
set -ex
cd /work/build
cmake \
-DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-DCMAKE_BUILD_TYPE=Release \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_TVM_OP=OFF \
-DUSE_CUDA=ON \
Expand Down
84 changes: 84 additions & 0 deletions cmake/Modules/FindNVML.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Find the NVML (NVIDIA Management Library) header and library.
#
# The following variables are optionally searched for defaults
#  NVML_ROOT_DIR: Base directory where all NVML components are found
#  NVML_INCLUDE_DIR: Directory where the NVML header is found
#  NVML_LIB_DIR: Directory where the NVML library is found
#
# The following are set after configuration is done:
#  NVML_FOUND
#  NVML_INCLUDE_DIRS
#  NVML_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NVML in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601

# Fix: the original `if($ENV{NVML_ROOT_DIR})` expanded the variable's *value*
# into the condition, which errors when the variable is unset and misfires for
# most values. `DEFINED ENV{...}` tests for the variable's presence.
if(DEFINED ENV{NVML_ROOT_DIR})
  message(WARNING "NVML_ROOT_DIR is deprecated. Please set NVML_ROOT instead.")
endif()

# First pass: honour the user-supplied hint variables and the CUDA toolkit
# location. find_path()/find_library() cache their results, so the fallback
# searches below become no-ops once these succeed.
find_path(NVML_INCLUDE_DIRS
  NAMES nvml.h
  HINTS
    ${NVML_INCLUDE_DIR}
    ${NVML_ROOT_DIR}
    ${NVML_ROOT_DIR}/include
    ${CUDA_TOOLKIT_ROOT_DIR}/include
    $ENV{NVML_DIR}/include
)

find_library(NVML_LIBRARIES
  NAMES nvidia-ml
  HINTS
    ${NVML_LIB_DIR}
    ${NVML_ROOT_DIR}
    ${NVML_ROOT_DIR}/lib
    ${NVML_ROOT_DIR}/lib/x86_64-linux-gnu
    ${NVML_ROOT_DIR}/lib64
    ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs
    $ENV{NVML_DIR}/lib
)

# Second pass (UNIX only): fall back to the conventional /usr/local/cuda
# install, using the driver stub library under lib64/stubs.
if(UNIX)
  set(search_paths "/usr/local/cuda")

  find_path(NVML_INCLUDE_DIRS
    NAMES nvml.h
    PATHS ${search_paths}
    PATH_SUFFIXES include
  )

  find_library(NVML_LIBRARIES
    NAMES nvidia-ml
    PATHS ${search_paths}
    PATH_SUFFIXES lib64/stubs
  )
endif()

# Sets NVML_FOUND and validates the two result variables.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NVML DEFAULT_MSG NVML_INCLUDE_DIRS NVML_LIBRARIES)

if(NVML_FOUND)
  message(STATUS "Found NVML (include: ${NVML_INCLUDE_DIRS}, library: ${NVML_LIBRARIES})")
  mark_as_advanced(NVML_ROOT_DIR NVML_INCLUDE_DIRS NVML_LIBRARIES)
endif()

7 changes: 7 additions & 0 deletions include/mxnet/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,13 @@ MXNET_DLL int MXSetProcessProfilerState(int state, int profile_process,
*/
MXNET_DLL int MXSetProfilerState(int state);

/*!
* \brief Set the scope of profiler for current process
* \param scope indicate the working scope of profiler
* \return 0 when success, -1 when failure happens.
*/
MXNET_DLL int MXSetProfilerScope(const char* scope);

/*!
* \brief Save profile and stop profiler
* \param finished true if stat output should stop after this point
Expand Down
4 changes: 4 additions & 0 deletions include/mxnet/libinfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@
#define MXNET_USE_CUDNN MSHADOW_USE_CUDNN
#endif

#ifndef MXNET_USE_NVML
#define MXNET_USE_NVML 0
#endif

#ifndef MXNET_USE_NCCL
#define MXNET_USE_NCCL 0
#endif
Expand Down
11 changes: 8 additions & 3 deletions include/mxnet/ndarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,9 @@ class NDArray {
CheckAndAlloc();
return ptr_->shandle;
}
/*! \brief assign profiler scope and name to the storage handles */
void AssignStorageInfo(const std::string& profiler_scope,
const std::string& name);
/*!
* \brief Block until all the pending write operations with respect
* to current NDArray are finished, and read can be performed.
Expand Down Expand Up @@ -989,7 +992,7 @@ class NDArray {
/*! \brief check if delay alloc is on, do alloc if not yet done */
inline void CheckAndAlloc(void) {
if (delay_alloc) {
shandle = Storage::Get()->Alloc(shandle.size, shandle.ctx);
Storage::Get()->Alloc(&shandle);
#if MXNET_USE_MKLDNN == 1
mkl_mem_ = nullptr;
#endif
Expand All @@ -1004,7 +1007,8 @@ class NDArray {
<< "CheckAndAlloc(dbytes) is only intended for kDefaultStorage";
dbytes = std::max(dbytes, static_cast<uint64_t>(shandle.size));
if (delay_alloc) {
shandle = Storage::Get()->Alloc(dbytes, shandle.ctx);
shandle.size = dbytes;
Storage::Get()->Alloc(&shandle);
#if MXNET_USE_MKLDNN == 1
mkl_mem_ = nullptr;
#endif
Expand All @@ -1013,7 +1017,8 @@ class NDArray {
// free storage
Storage::Get()->Free(shandle);
// init storage
shandle = Storage::Get()->Alloc(dbytes, shandle.ctx);
shandle.size = dbytes;
Storage::Get()->Alloc(&shandle);
#if MXNET_USE_MKLDNN == 1
mkl_mem_ = nullptr;
#endif
Expand Down
70 changes: 50 additions & 20 deletions include/mxnet/resource.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#define MXNET_RESOURCE_H_

#include <dmlc/logging.h>
#include <string>
#include "./base.h"
#include "./engine.h"
#include "./random_generator.h"
Expand Down Expand Up @@ -62,6 +63,28 @@ struct ResourceRequest {
: type(type) {}
};

namespace {
/// \brief Given a path, extract the filename.
inline std::string __extract_fname(const std::string& path) {
std::size_t last_dir_pos = path.find_last_of("/\\");
if (last_dir_pos == std::string::npos) {
return path;
}
return path.substr(last_dir_pos + 1);
}
} // anonymous namespace

#if (defined(__GNUC__) || defined(__GNUG__)) && !defined(__clang__)
#define MXNET_RESOURCE_DEFAULT_NAME_FARG(tag) \
std::string(tag) \
+ " (" + __extract_fname(__builtin_FILE()) \
+ " +" + std::to_string(__builtin_LINE()) + ")"
#else // !__GNUC__ || __clang__
#define MXNET_RESOURCE_DEFAULT_NAME_FARG(tag) \
std::string(tag) \
+ " (" + __extract_fname(__FILE__) \
+ " +" + std::to_string(__LINE__) + ")"
#endif // __GNUC__ && !__clang__

/*!
* \brief Resources used by mxnet operations.
Expand Down Expand Up @@ -120,16 +143,18 @@ struct Resource {
* when running on device, so the launched kernels that depend on the temp space
* can finish correctly.
*
* \param shape the Shape of returning tensor.
* \param stream the stream of retruning tensor.
* \param shape the shape of returning tensor.
* \param stream the stream of returning tensor.
* \param name the name of the operator requesting the resource.
* \return the mshadow tensor requested.
* \tparam xpu the device type of random number generator.
* \tparam ndim the number of dimension of the tensor requested.
* \tparam xpu the device type of random number generator.
* \tparam ndim the number of dimension of the tensor requested.
*/
template<typename xpu, int ndim>
inline mshadow::Tensor<xpu, ndim, real_t> get_space(
mshadow::Shape<ndim> shape, mshadow::Stream<xpu> *stream) const {
return get_space_typed<xpu, ndim, real_t>(shape, stream);
mshadow::Shape<ndim> shape, mshadow::Stream<xpu> *stream,
const std::string &name = MXNET_RESOURCE_DEFAULT_NAME_FARG("temp_space")) const {
return get_space_typed<xpu, ndim, real_t>(shape, stream, name);
}
/*!
* \brief Get cpu space requested as mshadow Tensor.
Expand All @@ -148,33 +173,37 @@ struct Resource {
* \brief Get space requested as mshadow Tensor in specified type.
* The caller can request arbitrary size.
*
* \param shape the Shape of returning tensor.
* \param stream the stream of retruning tensor.
* \param shape the shape of returning tensor.
* \param stream the stream of returning tensor.
* \param name the name of the operator requesting the resource.
* \return the mshadow tensor requested.
* \tparam xpu the device type of random number generator.
* \tparam ndim the number of dimension of the tensor requested.
* \tparam xpu the device type of random number generator.
* \tparam ndim the number of dimension of the tensor requested.
*/
template<typename xpu, int ndim, typename DType>
inline mshadow::Tensor<xpu, ndim, DType> get_space_typed(
mshadow::Shape<ndim> shape, mshadow::Stream<xpu> *stream) const {
mshadow::Shape<ndim> shape, mshadow::Stream<xpu> *stream,
const std::string &name = MXNET_RESOURCE_DEFAULT_NAME_FARG("temp_space")) const {
CHECK_EQ(req.type, ResourceRequest::kTempSpace);
return mshadow::Tensor<xpu, ndim, DType>(
reinterpret_cast<DType*>(get_space_internal(shape.Size() * sizeof(DType))),
reinterpret_cast<DType*>(get_space_internal(
shape.Size() * sizeof(DType), name)),
shape, shape[ndim - 1], stream);
}
#if MXNET_USE_CUDNN == 1
/*!
* \brief Get cudnn dropout descriptor from shared state space.
* \brief Get cuDNN dropout descriptor from shared state space.
*
* \param dropout_desc reference to previously created cudnn dropout descriptor.
* \param stream the stream of retruning tensor.
* \param dropout_desc reference to previously created cuDNN dropout descriptor.
* \param stream the stream of returning tensor.
* \param name the name of the operator requesting the resource.
* \return the mshadow tensor requested.
*/
void get_cudnn_dropout_desc(
cudnnDropoutDescriptor_t* dropout_desc,
cudnnDropoutDescriptor_t *dropout_desc,
mshadow::Stream<gpu> *stream,
const float dropout,
uint64_t seed) const;
const float dropout, uint64_t seed,
const std::string &name = MXNET_RESOURCE_DEFAULT_NAME_FARG("cudnn_dropout_state")) const;
#endif // MXNET_USE_CUDNN == 1

/*!
Expand All @@ -195,10 +224,11 @@ struct Resource {
}
/*!
* \brief internal function to get space from resources.
* \param size The size of the space.
* \param size The size of the space.
* \param name The name of the operator requesting the resource.
* \return The allocated space.
*/
void* get_space_internal(size_t size) const;
void* get_space_internal(size_t size, const std::string &name) const;
/*!
* \brief internal function to get cpu space from resources.
* \param size The size of space.
Expand Down
11 changes: 10 additions & 1 deletion include/mxnet/storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,14 @@
#define MXNET_STORAGE_H_

#include <memory>
#include <string>
#include "./base.h"

namespace mxnet {

#define MXNET_STORAGE_DEFAULT_PROFILER_SCOPE_CSTR "<unk>:"
#define MXNET_STORAGE_DEFAULT_NAME_CSTR "unknown"

/*!
* \brief Storage manager across multiple devices.
*/
Expand All @@ -55,7 +59,12 @@ class Storage {
* \brief Id for IPC shared memory
*/
int shared_pid{-1};
int shared_id{-1};
int shared_id {-1};
/*!
* \brief Attributes for tracking storage allocations.
*/
std::string profiler_scope{MXNET_STORAGE_DEFAULT_PROFILER_SCOPE_CSTR};
std::string name{MXNET_STORAGE_DEFAULT_NAME_CSTR};
};
/*!
* \brief Allocate a new contiguous memory for a given size.
Expand Down
Loading

0 comments on commit d37d7ba

Please sign in to comment.