Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

Permalink
Implement storage tagging, the first half of the memory profiler
Browse files Browse the repository at this point in the history
  • Loading branch information
ArmageddonKnight committed Mar 9, 2020
1 parent 48e9e2c commit d37d7ba
Show file tree
Hide file tree
Showing 46 changed files with 1,066 additions and 99 deletions.
14 changes: 14 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Format: Auto | Common | All | LIST(ARCH_AND_PTX ...)
option(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
option(USE_OPENCV "Build with OpenCV support" ON)
option(USE_OPENMP "Build with Openmp support" ON)
cmake_dependent_option(USE_NVML "Build with nvml support if found" ON "USE_CUDA" OFF)
cmake_dependent_option(USE_CUDNN "Build with cudnn support" ON "USE_CUDA" OFF) # one could set CUDNN_ROOT for search path
cmake_dependent_option(USE_NVTX "Build with nvtx support if found" ON "USE_CUDA" OFF)
cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON "NOT ARM" OFF)
Expand Down Expand Up @@ -630,6 +631,19 @@ if(USE_CUDA)
list(APPEND SOURCE ${CUDA})
add_definitions(-DMXNET_USE_CUDA=1)

# Optional NVML (NVIDIA Management Library) support. Only probed on UNIX
# because the bundled FindNVML module searches UNIX-style install locations
# (e.g. /usr/local/cuda).
if(UNIX)
if(USE_NVML)
# Resolved by cmake/Modules/FindNVML.cmake (added in this same commit).
find_package(NVML)
if(NVML_FOUND)
include_directories(${NVML_INCLUDE_DIRS})
list(APPEND mxnet_LINKER_LIBS ${NVML_LIBRARIES})
add_definitions(-DMXNET_USE_NVML=1)
else()
# Build proceeds without NVML; define the macro to 0 so C++ code can rely
# on MXNET_USE_NVML always being defined (see include/mxnet/libinfo.h).
add_definitions(-DMXNET_USE_NVML=0)
message(WARNING "Could not find NVML libraries")
endif()
endif()
endif()
if(USE_NCCL)
find_package(NCCL)
if(NCCL_FOUND)
Expand Down
7 changes: 7 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,12 @@ ifeq ($(USE_CUDA), 1)
# Make sure to add stubs as fallback in order to be able to build
# without full CUDA install (especially if run without nvidia-docker)
LDFLAGS += -L/usr/local/cuda/lib64/stubs
# Optional NVML support: link the NVIDIA Management Library and define
# MXNET_USE_NVML either way so the macro is always set for C++ code.
ifeq ($(USE_NVML), 1)
LDFLAGS += -lnvidia-ml
CFLAGS += -DMXNET_USE_NVML=1
else
CFLAGS += -DMXNET_USE_NVML=0
endif
ifeq ($(USE_NCCL), 1)
ifneq ($(USE_NCCL_PATH), NONE)
CFLAGS += -I$(USE_NCCL_PATH)/include
Expand All @@ -537,6 +543,7 @@ ifeq ($(USE_CUDA), 1)
CFLAGS += -DMXNET_USE_NCCL=0
endif
else
CFLAGS += -DMXNET_USE_NVML=0
CFLAGS += -DMXNET_USE_NCCL=0
endif

Expand Down
8 changes: 4 additions & 4 deletions ci/docker/runtime_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,7 @@ build_ubuntu_gpu_mkldnn() {
set -ex
cd /work/build
cmake \
-DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-DCMAKE_BUILD_TYPE=Release \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_TVM_OP=ON \
-DUSE_CUDA=ON \
Expand All @@ -729,7 +729,7 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
set -ex
cd /work/build
cmake \
-DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-DCMAKE_BUILD_TYPE=Release \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_TVM_OP=ON \
-DUSE_CUDA=ON \
Expand All @@ -744,7 +744,7 @@ build_ubuntu_gpu_cuda101_cudnn7() {
set -ex
cd /work/build
cmake \
-DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-DCMAKE_BUILD_TYPE=Release \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_TVM_OP=ON \
-DUSE_CUDA=ON \
Expand Down Expand Up @@ -800,7 +800,7 @@ build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op() {
set -ex
cd /work/build
cmake \
-DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-DCMAKE_BUILD_TYPE=Release \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_TVM_OP=OFF \
-DUSE_CUDA=ON \
Expand Down
84 changes: 84 additions & 0 deletions cmake/Modules/FindNVML.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Find the NVML (NVIDIA Management Library) header and library.
#
# The following variables are optionally searched for defaults
#  NVML_ROOT_DIR: Base directory where all NVML components are found
#  NVML_INCLUDE_DIR: Directory where the NVML header is found
#  NVML_LIB_DIR: Directory where the NVML library is found
#
# The following are set after configuration is done:
#  NVML_FOUND
#  NVML_INCLUDE_DIRS
#  NVML_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NVML in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601

# Fix: the original `if($ENV{NVML_ROOT_DIR})` expanded the variable's *value*
# into the condition, which errors when the variable is unset and misfires for
# most values. `DEFINED ENV{...}` tests for the variable's presence.
if(DEFINED ENV{NVML_ROOT_DIR})
  message(WARNING "NVML_ROOT_DIR is deprecated. Please set NVML_ROOT instead.")
endif()

# First pass: honour the user-supplied hint variables and the CUDA toolkit
# location. find_path()/find_library() cache their results, so the fallback
# searches below become no-ops once these succeed.
find_path(NVML_INCLUDE_DIRS
  NAMES nvml.h
  HINTS
    ${NVML_INCLUDE_DIR}
    ${NVML_ROOT_DIR}
    ${NVML_ROOT_DIR}/include
    ${CUDA_TOOLKIT_ROOT_DIR}/include
    $ENV{NVML_DIR}/include
)

find_library(NVML_LIBRARIES
  NAMES nvidia-ml
  HINTS
    ${NVML_LIB_DIR}
    ${NVML_ROOT_DIR}
    ${NVML_ROOT_DIR}/lib
    ${NVML_ROOT_DIR}/lib/x86_64-linux-gnu
    ${NVML_ROOT_DIR}/lib64
    ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs
    $ENV{NVML_DIR}/lib
)

# Second pass (UNIX only): fall back to the conventional /usr/local/cuda
# install, using the driver stub library under lib64/stubs.
if(UNIX)
  set(search_paths "/usr/local/cuda")

  find_path(NVML_INCLUDE_DIRS
    NAMES nvml.h
    PATHS ${search_paths}
    PATH_SUFFIXES include
  )

  find_library(NVML_LIBRARIES
    NAMES nvidia-ml
    PATHS ${search_paths}
    PATH_SUFFIXES lib64/stubs
  )
endif()

# Sets NVML_FOUND and validates the two result variables.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NVML DEFAULT_MSG NVML_INCLUDE_DIRS NVML_LIBRARIES)

if(NVML_FOUND)
  message(STATUS "Found NVML (include: ${NVML_INCLUDE_DIRS}, library: ${NVML_LIBRARIES})")
  mark_as_advanced(NVML_ROOT_DIR NVML_INCLUDE_DIRS NVML_LIBRARIES)
endif()

7 changes: 7 additions & 0 deletions include/mxnet/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,13 @@ MXNET_DLL int MXSetProcessProfilerState(int state, int profile_process,
*/
MXNET_DLL int MXSetProfilerState(int state);

/*!
* \brief Set the scope of profiler for current process
* \param scope indicate the working scope of profiler
* \return 0 when success, -1 when failure happens.
*/
MXNET_DLL int MXSetProfilerScope(const char* scope);

/*!
* \brief Save profile and stop profiler
* \param finished true if stat output should stop after this point
Expand Down
4 changes: 4 additions & 0 deletions include/mxnet/libinfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@
#define MXNET_USE_CUDNN MSHADOW_USE_CUDNN
#endif

#ifndef MXNET_USE_NVML
#define MXNET_USE_NVML 0
#endif

#ifndef MXNET_USE_NCCL
#define MXNET_USE_NCCL 0
#endif
Expand Down
11 changes: 8 additions & 3 deletions include/mxnet/ndarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,9 @@ class NDArray {
CheckAndAlloc();
return ptr_->shandle;
}
/*! \brief assign profiler scope and name to the storage handles */
void AssignStorageInfo(const std::string& profiler_scope,
const std::string& name);
/*!
* \brief Block until all the pending write operations with respect
* to current NDArray are finished, and read can be performed.
Expand Down Expand Up @@ -989,7 +992,7 @@ class NDArray {
/*! \brief check if delay alloc is on, do alloc if not yet done */
inline void CheckAndAlloc(void) {
if (delay_alloc) {
shandle = Storage::Get()->Alloc(shandle.size, shandle.ctx);
Storage::Get()->Alloc(&shandle);
#if MXNET_USE_MKLDNN == 1
mkl_mem_ = nullptr;
#endif
Expand All @@ -1004,7 +1007,8 @@ class NDArray {
<< "CheckAndAlloc(dbytes) is only intended for kDefaultStorage";
dbytes = std::max(dbytes, static_cast<uint64_t>(shandle.size));
if (delay_alloc) {
shandle = Storage::Get()->Alloc(dbytes, shandle.ctx);
shandle.size = dbytes;
Storage::Get()->Alloc(&shandle);
#if MXNET_USE_MKLDNN == 1
mkl_mem_ = nullptr;
#endif
Expand All @@ -1013,7 +1017,8 @@ class NDArray {
// free storage
Storage::Get()->Free(shandle);
// init storage
shandle = Storage::Get()->Alloc(dbytes, shandle.ctx);
shandle.size = dbytes;
Storage::Get()->Alloc(&shandle);
#if MXNET_USE_MKLDNN == 1
mkl_mem_ = nullptr;
#endif
Expand Down
70 changes: 50 additions & 20 deletions include/mxnet/resource.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#define MXNET_RESOURCE_H_

#include <dmlc/logging.h>
#include <string>
#include "./base.h"
#include "./engine.h"
#include "./random_generator.h"
Expand Down Expand Up @@ -62,6 +63,28 @@ struct ResourceRequest {
: type(type) {}
};

namespace {
/// \brief Given a path, extract the filename.
inline std::string __extract_fname(const std::string& path) {
std::size_t last_dir_pos = path.find_last_of("/\\");
if (last_dir_pos == std::string::npos) {
return path;
}
return path.substr(last_dir_pos + 1);
}
} // anonymous namespace

#if (defined(__GNUC__) || defined(__GNUG__)) && !defined(__clang__)
#define MXNET_RESOURCE_DEFAULT_NAME_FARG(tag) \
std::string(tag) \
+ " (" + __extract_fname(__builtin_FILE()) \
+ " +" + std::to_string(__builtin_LINE()) + ")"
#else // !__GNUC__ || __clang__
#define MXNET_RESOURCE_DEFAULT_NAME_FARG(tag) \
std::string(tag) \
+ " (" + __extract_fname(__FILE__) \
+ " +" + std::to_string(__LINE__) + ")"
#endif // __GNUC__ && !__clang__

/*!
* \brief Resources used by mxnet operations.
Expand Down Expand Up @@ -120,16 +143,18 @@ struct Resource {
* when running on device, so the launched kernels that depend on the temp space
* can finish correctly.
*
* \param shape the Shape of returning tensor.
* \param stream the stream of retruning tensor.
* \param shape the shape of returning tensor.
* \param stream the stream of returning tensor.
* \param name the name of the operator requesting the resource.
* \return the mshadow tensor requested.
* \tparam xpu the device type of random number generator.
* \tparam ndim the number of dimension of the tensor requested.
* \tparam xpu the device type of random number generator.
* \tparam ndim the number of dimension of the tensor requested.
*/
template<typename xpu, int ndim>
inline mshadow::Tensor<xpu, ndim, real_t> get_space(
mshadow::Shape<ndim> shape, mshadow::Stream<xpu> *stream) const {
return get_space_typed<xpu, ndim, real_t>(shape, stream);
mshadow::Shape<ndim> shape, mshadow::Stream<xpu> *stream,
const std::string &name = MXNET_RESOURCE_DEFAULT_NAME_FARG("temp_space")) const {
return get_space_typed<xpu, ndim, real_t>(shape, stream, name);
}
/*!
* \brief Get cpu space requested as mshadow Tensor.
Expand All @@ -148,33 +173,37 @@ struct Resource {
* \brief Get space requested as mshadow Tensor in specified type.
* The caller can request arbitrary size.
*
* \param shape the Shape of returning tensor.
* \param stream the stream of retruning tensor.
* \param shape the shape of returning tensor.
* \param stream the stream of returning tensor.
* \param name the name of the operator requesting the resource.
* \return the mshadow tensor requested.
* \tparam xpu the device type of random number generator.
* \tparam ndim the number of dimension of the tensor requested.
* \tparam xpu the device type of random number generator.
* \tparam ndim the number of dimension of the tensor requested.
*/
template<typename xpu, int ndim, typename DType>
inline mshadow::Tensor<xpu, ndim, DType> get_space_typed(
mshadow::Shape<ndim> shape, mshadow::Stream<xpu> *stream) const {
mshadow::Shape<ndim> shape, mshadow::Stream<xpu> *stream,
const std::string &name = MXNET_RESOURCE_DEFAULT_NAME_FARG("temp_space")) const {
CHECK_EQ(req.type, ResourceRequest::kTempSpace);
return mshadow::Tensor<xpu, ndim, DType>(
reinterpret_cast<DType*>(get_space_internal(shape.Size() * sizeof(DType))),
reinterpret_cast<DType*>(get_space_internal(
shape.Size() * sizeof(DType), name)),
shape, shape[ndim - 1], stream);
}
#if MXNET_USE_CUDNN == 1
/*!
* \brief Get cudnn dropout descriptor from shared state space.
* \brief Get cuDNN dropout descriptor from shared state space.
*
* \param dropout_desc reference to previously created cudnn dropout descriptor.
* \param stream the stream of retruning tensor.
* \param dropout_desc reference to previously created cuDNN dropout descriptor.
* \param stream the stream of returning tensor.
* \param name the name of the operator requesting the resource.
* \return the mshadow tensor requested.
*/
void get_cudnn_dropout_desc(
cudnnDropoutDescriptor_t* dropout_desc,
cudnnDropoutDescriptor_t *dropout_desc,
mshadow::Stream<gpu> *stream,
const float dropout,
uint64_t seed) const;
const float dropout, uint64_t seed,
const std::string &name = MXNET_RESOURCE_DEFAULT_NAME_FARG("cudnn_dropout_state")) const;
#endif // MXNET_USE_CUDNN == 1

/*!
Expand All @@ -195,10 +224,11 @@ struct Resource {
}
/*!
* \brief internal function to get space from resources.
* \param size The size of the space.
* \param size The size of the space.
* \param name The name of the operator requesting the resource.
* \return The allocated space.
*/
void* get_space_internal(size_t size) const;
void* get_space_internal(size_t size, const std::string &name) const;
/*!
* \brief internal function to get cpu space from resources.
* \param size The size of space.
Expand Down
11 changes: 10 additions & 1 deletion include/mxnet/storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,14 @@
#define MXNET_STORAGE_H_

#include <memory>
#include <string>
#include "./base.h"

namespace mxnet {

#define MXNET_STORAGE_DEFAULT_PROFILER_SCOPE_CSTR "<unk>:"
#define MXNET_STORAGE_DEFAULT_NAME_CSTR "unknown"

/*!
* \brief Storage manager across multiple devices.
*/
Expand All @@ -55,7 +59,12 @@ class Storage {
* \brief Id for IPC shared memory
*/
int shared_pid{-1};
int shared_id{-1};
int shared_id {-1};
/*!
* \brief Attributes for tracking storage allocations.
*/
std::string profiler_scope{MXNET_STORAGE_DEFAULT_PROFILER_SCOPE_CSTR};
std::string name{MXNET_STORAGE_DEFAULT_NAME_CSTR};
};
/*!
* \brief Allocate a new contiguous memory for a given size.
Expand Down
Loading

0 comments on commit d37d7ba

Please sign in to comment.