From 431eb5c618205f0c5ee859f08ecd6c4ff1b06e1b Mon Sep 17 00:00:00 2001
From: Chris Olivier <cjolivier01@gmail.com>
Date: Sun, 1 Apr 2018 16:19:45 -0700
Subject: [PATCH] [MXNET-247] Always build profiler (#10308)

* Always build profiler

* Update naive_engine.cc

* remove PROFILER_MESSAGE macro

* Remove USE_PROFILER=1 from CI runs
---
 CMakeLists.txt                      |  4 --
 Makefile                            |  5 --
 ci/docker/runtime_functions.sh      | 39 +++++--------
 include/mxnet/base.h                | 11 +---
 make/config.mk                      |  3 -
 src/c_api/c_api_profile.cc          | 90 -----------------------------
 src/common/rtc.cc                   |  2 +-
 src/engine/naive_engine.cc          | 13 +----
 src/engine/threaded_engine.cc       | 12 +---
 src/engine/threaded_engine.h        |  6 --
 src/executor/graph_executor.cc      | 37 +++---------
 src/executor/graph_executor.h       |  3 +
 src/imperative/imperative_utils.h   | 10 ++--
 src/io/image_io.cc                  |  4 +-
 src/kvstore/comm.h                  |  8 +--
 src/kvstore/gradient_compression.cc |  8 +--
 src/kvstore/kvstore_dist.h          | 10 ++--
 src/kvstore/kvstore_local.h         |  2 +-
 src/kvstore/kvstore_nccl.h          |  6 +-
 src/ndarray/ndarray.cc              | 32 +++++-----
 src/operator/custom/custom-inl.h    |  2 +-
 src/operator/custom/ndarray_op.cc   |  4 +-
 src/operator/operator_util.cc       |  6 +-
 src/optimizer/sgd-inl.h             |  8 +--
 src/profiler/profiler.cc            |  4 --
 src/profiler/storage_profiler.h     |  3 -
 src/resource.cc                     |  6 +-
 src/storage/storage.cc              |  8 ---
 28 files changed, 84 insertions(+), 262 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 116de37fb857..db14dadf80f8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -653,10 +653,6 @@ if(MSVC AND USE_MXNET_LIB_NAMING)
   set_target_properties(mxnet PROPERTIES OUTPUT_NAME "libmxnet")
 endif()
 
-if(USE_PROFILER)
-    add_definitions(-DMXNET_USE_PROFILER)
-endif()
-
 add_subdirectory(tests)
 
 include(GNUInstallDirs)
diff --git a/Makefile b/Makefile
index dba649f73112..ae57114f4905 100644
--- a/Makefile
+++ b/Makefile
@@ -100,11 +100,6 @@ else
 	NVCCFLAGS += -std=c++11 -Xcompiler -D_FORCE_INLINES -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
 endif
 
-# CFLAGS for profiler
-ifeq ($(USE_PROFILER), 1)
-	CFLAGS += -DMXNET_USE_PROFILER=1
-endif
-
 # CFLAGS for segfault logger
 ifeq ($(USE_SIGNAL_HANDLER), 1)
 	CFLAGS += -DMXNET_USE_SIGNAL_HANDLER=1
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index f35de6bef0b9..d6f07da942a5 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -179,7 +179,6 @@ build_centos7_cpu() {
         DEV=1 \
         USE_LAPACK=1 \
         USE_LAPACK_PATH=/usr/lib64/liblapack.so \
-        USE_PROFILER=1 \
         USE_BLAS=openblas \
         -j$(nproc)
 }
@@ -191,7 +190,6 @@ build_centos7_mkldnn() {
         DEV=1 \
         USE_LAPACK=1 \
         USE_LAPACK_PATH=/usr/lib64/liblapack.so \
-        USE_PROFILER=1 \
         USE_MKLDNN=1 \
         USE_BLAS=openblas \
         -j$(nproc)
@@ -204,7 +202,6 @@ build_centos7_gpu() {
         DEV=1 \
         USE_LAPACK=1 \
         USE_LAPACK_PATH=/usr/lib64/liblapack.so \
-        USE_PROFILER=1 \
         USE_BLAS=openblas \
         USE_CUDA=1 \
         USE_CUDA_PATH=/usr/local/cuda \
@@ -216,7 +213,6 @@ build_ubuntu_cpu_openblas() {
     set -ex
     make \
         DEV=1                         \
-        USE_PROFILER=1                \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
         -j$(nproc)
@@ -225,7 +221,6 @@ build_ubuntu_cpu_openblas() {
 build_ubuntu_cpu_clang39() {
     set -ex
     make \
-        USE_PROFILER=1                \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
         USE_OPENMP=0                  \
@@ -237,7 +232,6 @@ build_ubuntu_cpu_clang39() {
 build_ubuntu_cpu_clang50() {
     set -ex
     make \
-        USE_PROFILER=1                \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
         USE_OPENMP=1                  \
@@ -249,7 +243,6 @@ build_ubuntu_cpu_clang50() {
 build_ubuntu_cpu_clang39_mkldnn() {
     set -ex
     make \
-        USE_PROFILER=1                \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
         USE_MKLDNN=1                  \
@@ -262,7 +255,6 @@ build_ubuntu_cpu_clang39_mkldnn() {
 build_ubuntu_cpu_clang50_mkldnn() {
     set -ex
     make \
-        USE_PROFILER=1                \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
         USE_MKLDNN=1                  \
@@ -276,7 +268,6 @@ build_ubuntu_cpu_mkldnn() {
     set -ex
     make  \
         DEV=1                         \
-        USE_PROFILER=1                \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
         USE_MKLDNN=1                  \
@@ -287,7 +278,6 @@ build_ubuntu_gpu_mkldnn() {
     set -ex
     make  \
         DEV=1                         \
-        USE_PROFILER=1                \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
         USE_MKLDNN=1                  \
@@ -301,7 +291,6 @@ build_ubuntu_gpu_cuda91_cudnn7() {
     set -ex
     make  \
         DEV=1                         \
-        USE_PROFILER=1                \
         USE_BLAS=openblas             \
         USE_CUDA=1                    \
         USE_CUDA_PATH=/usr/local/cuda \
@@ -314,7 +303,7 @@ build_ubuntu_amalgamation() {
     set -ex
     # Amalgamation can not be run with -j nproc
     make -C amalgamation/ clean
-    make -C amalgamation/ USE_BLAS=openblas    
+    make -C amalgamation/ USE_BLAS=openblas
 }
 
 build_ubuntu_amalgamation_min() {
@@ -335,7 +324,7 @@ build_ubuntu_gpu_cmake_mkldnn() {
         -DCMAKE_BUILD_TYPE=Release \
         -G Ninja                   \
         /work/mxnet
-    
+
     ninja -v
 }
 
@@ -350,7 +339,7 @@ build_ubuntu_gpu_cmake() {
         -DCMAKE_BUILD_TYPE=Release \
         -G Ninja                   \
         /work/mxnet
-    
+
     ninja -v
 }
 
@@ -367,7 +356,7 @@ sanity_check() {
 
 unittest_ubuntu_python2_cpu() {
     set -ex
-    export PYTHONPATH=./python/ 
+    export PYTHONPATH=./python/
     # MXNET_MKLDNN_DEBUG is buggy and produces false positives
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
@@ -379,7 +368,7 @@ unittest_ubuntu_python2_cpu() {
 
 unittest_ubuntu_python3_cpu() {
     set -ex
-    export PYTHONPATH=./python/ 
+    export PYTHONPATH=./python/
     # MXNET_MKLDNN_DEBUG is buggy and produces false positives
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
@@ -390,9 +379,9 @@ unittest_ubuntu_python3_cpu() {
 
 unittest_ubuntu_python2_gpu() {
     set -ex
-    export PYTHONPATH=./python/ 
+    export PYTHONPATH=./python/
     # MXNET_MKLDNN_DEBUG is buggy and produces false positives
-    # https://github.com/apache/incubator-mxnet/issues/10026    
+    # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
     nosetests-2.7 --verbose tests/python/gpu
@@ -400,7 +389,7 @@ unittest_ubuntu_python2_gpu() {
 
 unittest_ubuntu_python3_gpu() {
     set -ex
-    export PYTHONPATH=./python/ 
+    export PYTHONPATH=./python/
     # MXNET_MKLDNN_DEBUG is buggy and produces false positives
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present
@@ -412,9 +401,9 @@ unittest_ubuntu_python3_gpu() {
 # need to separte it from unittest_ubuntu_python2_gpu()
 unittest_ubuntu_python2_quantization_gpu() {
     set -ex
-    export PYTHONPATH=./python/ 
+    export PYTHONPATH=./python/
     # MXNET_MKLDNN_DEBUG is buggy and produces false positives
-    # https://github.com/apache/incubator-mxnet/issues/10026    
+    # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
     nosetests-2.7 --verbose tests/python/quantization_gpu
@@ -424,7 +413,7 @@ unittest_ubuntu_python2_quantization_gpu() {
 # need to separte it from unittest_ubuntu_python3_gpu()
 unittest_ubuntu_python3_quantization_gpu() {
     set -ex
-    export PYTHONPATH=./python/ 
+    export PYTHONPATH=./python/
     # MXNET_MKLDNN_DEBUG is buggy and produces false positives
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present
@@ -479,7 +468,7 @@ unittest_centos7_gpu() {
     python3.6 -m "nose" --with-timer --verbose tests/python/gpu
 }
 
-integrationtest_ubuntu_cpu_onnx() { 
+integrationtest_ubuntu_cpu_onnx() {
 	set -ex
 	export PYTHONPATH=./python/
 	python example/onnx/super_resolution.py
@@ -496,7 +485,7 @@ integrationtest_ubuntu_gpu_python() {
 
 integrationtest_ubuntu_gpu_caffe() {
     set -ex
-    export PYTHONPATH=/work/deps/caffe/python:./python 
+    export PYTHONPATH=/work/deps/caffe/python:./python
     python tools/caffe_converter/test_converter.py
 }
 
@@ -545,7 +534,7 @@ test_ubuntu_cpu_python3() {
 deploy_docs() {
     set -ex
     pushd .
-    
+
     make docs
 
     popd
diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index c0eb97aa0b3f..783002e6fa48 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -99,15 +99,6 @@
 #define MXNET_PREDICT_ONLY 0
 #endif
 
-/*!
- * \brief define operator message for profiler
- */
-#if MXNET_USE_PROFILER
-#define PROFILER_MESSAGE(msg)     msg
-#else
-#define PROFILER_MESSAGE(msg)     nullptr
-#endif
-
 /*! \brief major version */
 #define MXNET_MAJOR 1
 /*! \brief minor version */
@@ -121,7 +112,7 @@
 /*!
  * \brief define function name as profiler message
  */
-#define PROFILER_MESSAGE_FUNCNAME PROFILER_MESSAGE(__FUNCTION__)
+#define PROFILER_MESSAGE_FUNCNAME (__FUNCTION__)
 
 /*! \brief namespace of mxnet */
 namespace mxnet {
diff --git a/make/config.mk b/make/config.mk
index fa429f31f298..9eded6f50807 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -47,9 +47,6 @@ DEV = 0
 # whether compile with debug
 DEBUG = 0
 
-# whether compile with profiler
-USE_PROFILER =
-
 # whether to turn on segfault signal handler to log the stack trace
 USE_SIGNAL_HANDLER =
 
diff --git a/src/c_api/c_api_profile.cc b/src/c_api/c_api_profile.cc
index 16b47ad20b47..c946e3b6bd50 100644
--- a/src/c_api/c_api_profile.cc
+++ b/src/c_api/c_api_profile.cc
@@ -37,7 +37,6 @@ namespace mxnet {
 
 // #define PROFILE_API_INCLUDE_AS_EVENT
 
-#if MXNET_USE_PROFILER
 static profiler::ProfileDomain api_domain("MXNET_C_API");
 static profiler::ProfileCounter api_call_counter("MXNet C API Calls", &api_domain);
 static profiler::ProfileCounter api_concurrency_counter("MXNet C API Concurrency",
@@ -114,10 +113,8 @@ class ProfilingThreadData {
 };
 
 static thread_local ProfilingThreadData thread_profiling_data;
-#endif  // MXNET_USE_PROFILER
 
 extern void on_enter_api(const char *function) {
-#if MXNET_USE_PROFILER
   if (profiler::Profiler::Get()->IsProfiling(profiler::Profiler::kAPI)) {
     if (!thread_profiling_data.ignore_call_) {
       ++api_call_counter;
@@ -136,10 +133,8 @@ extern void on_enter_api(const char *function) {
 #endif  // PROFILE_API_INCLUDE_AS_EVENT
     }
   }
-#endif  // MXNET_USE_PROFILER
 }
 extern void on_exit_api() {
-#if MXNET_USE_PROFILER
   if (profiler::Profiler::Get()->IsProfiling(profiler::Profiler::kAPI)) {
     if (!thread_profiling_data.ignore_call_) {
       CHECK(!thread_profiling_data.calls_.empty());
@@ -152,7 +147,6 @@ extern void on_exit_api() {
       --api_concurrency_counter;
     }
   }
-#endif  // MXNET_USE_PROFILER
 }
 
 /*!
@@ -160,16 +154,12 @@ extern void on_exit_api() {
  */
 struct IgnoreProfileCallScope {
   IgnoreProfileCallScope()  {
-#if MXNET_USE_PROFILER
     DCHECK_EQ(thread_profiling_data.ignore_call_, false);
     thread_profiling_data.ignore_call_ = true;
-#endif  // MXNET_USE_PROFILER
   }
   ~IgnoreProfileCallScope() {
-#if MXNET_USE_PROFILER
     DCHECK_EQ(thread_profiling_data.ignore_call_, true);
     thread_profiling_data.ignore_call_ = false;
-#endif  // MXNET_USE_PROFILER
   }
 };
 
@@ -203,16 +193,6 @@ struct PythonProfileObjects {
 };
 static PythonProfileObjects python_profile_objects;
 
-#if !defined(MXNET_USE_PROFILER) || !MXNET_USE_PROFILER
-static void warn_not_built_with_profiler_enabled() {
-  static volatile bool warned_not_built_with_profiler_enabled = false;
-  if (!warned_not_built_with_profiler_enabled) {
-    warned_not_built_with_profiler_enabled = true;
-    LOG(WARNING) << "Need to compile with USE_PROFILER=1 for MXNet Profiling";
-  }
-}
-#endif  // MXNET_USE_PROFILER
-
 struct ProfileConfigParam : public dmlc::Parameter<ProfileConfigParam> {
   bool profile_all;
   bool profile_symbolic;
@@ -267,7 +247,6 @@ DMLC_REGISTER_PARAMETER(ProfileMarkerScopeParam);
 int MXSetProfilerConfig(int num_params, const char* const* keys, const char* const* vals) {
     mxnet::IgnoreProfileCallScope ignore;
   API_BEGIN();
-#if MXNET_USE_PROFILER
     std::vector<std::pair<std::string, std::string>> kwargs;
     kwargs.reserve(num_params);
     for (int i = 0; i < num_params; ++i) {
@@ -287,9 +266,6 @@ int MXSetProfilerConfig(int num_params, const char* const* keys, const char* con
                                          param.continuous_dump,
                                          param.dump_period,
                                          param.aggregate_stats);
-#else
-    warn_not_built_with_profiler_enabled();
-#endif
   API_END();
 }
 
@@ -297,7 +273,6 @@ int MXAggregateProfileStatsPrint(const char **out_str, int reset) {
   MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
   API_BEGIN();
     CHECK_NOTNULL(out_str);
-#if MXNET_USE_PROFILER
     profiler::Profiler *profiler = profiler::Profiler::Get();
     if (profiler->IsEnableOutput()) {
       // Register stats up until now
@@ -309,10 +284,6 @@ int MXAggregateProfileStatsPrint(const char **out_str, int reset) {
       stats->Dump(os, reset != 0);
     }
     ret->ret_str = os.str();
-#else
-    warn_not_built_with_profiler_enabled();
-    ret->ret_str.clear();
-#endif
     *out_str = (ret->ret_str).c_str();
   API_END();
 }
@@ -320,14 +291,10 @@ int MXAggregateProfileStatsPrint(const char **out_str, int reset) {
 int MXDumpProfile(int finished) {
   mxnet::IgnoreProfileCallScope ignore;
   API_BEGIN();
-#if MXNET_USE_PROFILER
     profiler::Profiler *profiler = profiler::Profiler::Get();
     CHECK(profiler->IsEnableOutput())
       << "Profiler hasn't been run. Config and start profiler first";
     profiler->DumpProfile(finished != 0);
-#else
-    warn_not_built_with_profiler_enabled();
-#endif
   API_END()
 }
 
@@ -335,7 +302,6 @@ int MXSetProfilerState(int state) {
   mxnet::IgnoreProfileCallScope ignore;
   // state, kNotRunning: 0, kRunning: 1
   API_BEGIN();
-#if MXNET_USE_PROFILER
     switch (state) {
       case profiler::Profiler::kNotRunning:
         profiler::vtune::vtune_pause();
@@ -345,26 +311,18 @@ int MXSetProfilerState(int state) {
         break;
     }
     profiler::Profiler::Get()->SetState(profiler::Profiler::ProfilerState(state));
-#else
-    warn_not_built_with_profiler_enabled();
-#endif
   API_END();
 }
 
 int MXProfileCreateDomain(const char *domain, ProfileHandle *out) {
   mxnet::IgnoreProfileCallScope ignore;
   API_BEGIN();
-#if MXNET_USE_PROFILER
     auto dom = std::make_shared<profiler::ProfileDomain>(domain);
     {
       std::unique_lock<std::mutex> lock(python_profile_objects.cs_domains_);
       python_profile_objects.domains_.push_back(dom);
     }
     *out = dom.get();
-#else
-    warn_not_built_with_profiler_enabled();
-    *out = nullptr;
-#endif
   API_END();
 }
 
@@ -373,7 +331,6 @@ int MXProfileCreateTask(ProfileHandle domain,
                         ProfileHandle *out) {
   mxnet::IgnoreProfileCallScope ignore;
   API_BEGIN();
-#if MXNET_USE_PROFILER
     auto ctr =
       std::make_shared<profiler::ProfileTask>(task_name,
                                                 static_cast<profiler::ProfileDomain *>(domain));
@@ -382,10 +339,6 @@ int MXProfileCreateTask(ProfileHandle domain,
       python_profile_objects.tasks_.emplace(std::make_pair(ctr.get(), ctr));
     }
     *out = ctr.get();
-#else
-    warn_not_built_with_profiler_enabled();
-    *out = nullptr;
-#endif
   API_END();
 }
 
@@ -394,7 +347,6 @@ int MXProfileCreateFrame(ProfileHandle domain,
                          ProfileHandle *out) {
   mxnet::IgnoreProfileCallScope ignore;
   API_BEGIN();
-#if MXNET_USE_PROFILER
     auto ctr =
       std::make_shared<profiler::ProfileFrame>(frame_name,
                                               static_cast<profiler::ProfileDomain *>(domain));
@@ -403,17 +355,12 @@ int MXProfileCreateFrame(ProfileHandle domain,
       python_profile_objects.frames_.emplace(std::make_pair(ctr.get(), ctr));
     }
     *out = ctr.get();
-#else
-    warn_not_built_with_profiler_enabled();
-    *out = nullptr;
-#endif
   API_END();
 }
 
 int MXProfileCreateEvent(const char *event_name, ProfileHandle *out) {
   mxnet::IgnoreProfileCallScope ignore;
   API_BEGIN();
-#if MXNET_USE_PROFILER
     auto ctr =
       std::make_shared<profiler::ProfileEvent>(event_name);
     {
@@ -421,17 +368,12 @@ int MXProfileCreateEvent(const char *event_name, ProfileHandle *out) {
       python_profile_objects.events_.emplace(std::make_pair(ctr.get(), ctr));
     }
     *out = ctr.get();
-#else
-    warn_not_built_with_profiler_enabled();
-    *out = nullptr;
-#endif
   API_END();
 }
 
 int MXProfileDestroyHandle(ProfileHandle object_handle) {
   mxnet::IgnoreProfileCallScope ignore;
   API_BEGIN();
-#if MXNET_USE_PROFILER
     CHECK_NE(object_handle, static_cast<ProfileHandle>(nullptr))
       << "Invalid NULL handle passed to MXProfileDestroyHandle";
     std::shared_ptr<profiler::ProfileObject> shared_object_ptr(nullptr);
@@ -484,40 +426,28 @@ int MXProfileDestroyHandle(ProfileHandle object_handle) {
       }
     }
     shared_object_ptr.reset();  // Destroy out of lock scope
-#else
-    warn_not_built_with_profiler_enabled();
-#endif
   API_END();
 }
 
 int MXProfileDurationStart(ProfileHandle duration_handle) {
   mxnet::IgnoreProfileCallScope ignore;
   API_BEGIN();
-#if MXNET_USE_PROFILER
     CHECK_NOTNULL(duration_handle);
     static_cast<profiler::ProfileDuration *>(duration_handle)->start();
-#else
-    warn_not_built_with_profiler_enabled();
-#endif
   API_END();
 }
 
 int MXProfileDurationStop(ProfileHandle duration_handle) {
   mxnet::IgnoreProfileCallScope ignore;
   API_BEGIN();
-#if MXNET_USE_PROFILER
     CHECK_NOTNULL(duration_handle);
     static_cast<profiler::ProfileDuration *>(duration_handle)->stop();
-#else
-    warn_not_built_with_profiler_enabled();
-#endif
   API_END();
 }
 
 int MXProfilePause(int paused) {
   mxnet::IgnoreProfileCallScope ignore;
   API_BEGIN();
-#if MXNET_USE_PROFILER
     if (paused) {
       profiler::vtune::vtune_pause();
       profiler::Profiler::Get()->set_paused(true);
@@ -525,9 +455,6 @@ int MXProfilePause(int paused) {
       profiler::Profiler::Get()->set_paused(false);
       profiler::vtune::vtune_resume();
     }
-#else
-    warn_not_built_with_profiler_enabled();
-#endif
   API_END();
 }
 
@@ -536,7 +463,6 @@ int MXProfileCreateCounter(ProfileHandle domain,
                            ProfileHandle *out) {
   mxnet::IgnoreProfileCallScope ignore;
   API_BEGIN();
-#if MXNET_USE_PROFILER
     auto ctr =
       std::make_shared<profiler::ProfileCounter>(counter_name,
                                                 static_cast<profiler::ProfileDomain *>(domain));
@@ -545,32 +471,20 @@ int MXProfileCreateCounter(ProfileHandle domain,
       python_profile_objects.counters_.emplace(std::make_pair(ctr.get(), ctr));
     }
     *out = ctr.get();
-#else
-    warn_not_built_with_profiler_enabled();
-    *out = nullptr;
-#endif
   API_END();
 }
 
 int MXProfileSetCounter(ProfileHandle counter_handle, uint64_t value) {
   mxnet::IgnoreProfileCallScope ignore;
   API_BEGIN();
-#if MXNET_USE_PROFILER
     static_cast<profiler::ProfileCounter *>(counter_handle)->operator=(value);
-#else
-    warn_not_built_with_profiler_enabled();
-#endif
   API_END();
 }
 
 int MXProfileAdjustCounter(ProfileHandle counter_handle, int64_t by_value) {
   mxnet::IgnoreProfileCallScope ignore;
   API_BEGIN();
-#if MXNET_USE_PROFILER
     static_cast<profiler::ProfileCounter *>(counter_handle)->operator+=(by_value);
-#else
-    warn_not_built_with_profiler_enabled();
-#endif
   API_END();
 }
 
@@ -579,7 +493,6 @@ int MXProfileSetMarker(ProfileHandle domain,
                        const char *scope) {
   mxnet::IgnoreProfileCallScope ignore;
   API_BEGIN();
-#if MXNET_USE_PROFILER
     ProfileMarkerScopeParam param;
     std::vector<std::pair<std::string, std::string>> kwargs = {{ "scope", scope }};
     param.Init(kwargs);
@@ -588,8 +501,5 @@ int MXProfileSetMarker(ProfileHandle domain,
                                          static_cast<profiler::ProfileMarker::MarkerScope>(
                                            param.scope));
     marker.mark();
-#else
-    warn_not_built_with_profiler_enabled();
-#endif
   API_END();
 }
diff --git a/src/common/rtc.cc b/src/common/rtc.cc
index 444553b128b0..da083c9244ca 100644
--- a/src/common/rtc.cc
+++ b/src/common/rtc.cc
@@ -178,7 +178,7 @@ void CudaModule::Kernel::Launch(
         p_args.data(), 0));
     CUDA_CALL(cudaStreamSynchronize(s->stream_));
   }, ctx, read_vars, write_vars, FnProperty::kNormal, 0,
-  PROFILER_MESSAGE(mangled_name_.c_str()));
+  mangled_name_.c_str());
 }
 
 
diff --git a/src/engine/naive_engine.cc b/src/engine/naive_engine.cc
index 6246b7316005..1fa530696b36 100644
--- a/src/engine/naive_engine.cc
+++ b/src/engine/naive_engine.cc
@@ -94,7 +94,6 @@ class NaiveEngine final : public Engine {
     NaiveOpr *opr = op->Cast<NaiveOpr>();
     opr->profiling = profiling && profiler->IsProfiling(profiler::Profiler::kSymbolic);
     this->PushAsync([&](RunContext ctx, CallbackOnComplete on_complete) {
-#if MXNET_USE_PROFILER
         if (opr->profiling) {
           std::unique_ptr<profiler::ProfileOperator::Attributes> attrs;
           if (profiler->AggregateEnabled()) {
@@ -107,16 +106,13 @@ class NaiveEngine final : public Engine {
         if (opr->profiling) {
           opr->opr_profile->stop();
         }
-#else
-        opr->fn(ctx, on_complete);
-#endif
       },
       exec_ctx,
       opr->const_vars,
       opr->mutable_vars,
       opr->prop,
       priority,
-      PROFILER_MESSAGE(opr->opr_name));
+      opr->opr_name);
   }
 
   void PushAsync(AsyncFn exec_fun,
@@ -130,7 +126,6 @@ class NaiveEngine final : public Engine {
     CallbackOnComplete callback = CreateCallback(
         NaiveEngine::OnComplete, nullptr);
     this->req_completed_ = false;
-#if MXNET_USE_PROFILER
     profiler::Profiler *profiler = profiler::Profiler::Get();
     NaiveOpr *opr = nullptr;
     const bool profiling = opr_name && profiler->IsProfiling(profiler::Profiler::kImperative);
@@ -145,7 +140,6 @@ class NaiveEngine final : public Engine {
       opr->opr_profile.reset(new profiler::ProfileOperator(opr->opr_name, attrs.release()));
       opr->opr_profile->start(exec_ctx.dev_type, exec_ctx.dev_id);
     }
-#endif
     if (exec_ctx.dev_mask() == gpu::kDevMask) {
 #if MXNET_USE_CUDA
       size_t dev_id = static_cast<size_t>(exec_ctx.dev_id);
@@ -165,16 +159,14 @@ class NaiveEngine final : public Engine {
     }
     CHECK(this->req_completed_)
         << "NaiveEngine only support synchronize Push so far";
-#if MXNET_USE_PROFILER
     if (profiling) {
       opr->opr_profile->stop();
     }
-#endif
   }
 
   void DeleteVariable(SyncFn delete_fn, Context exec_ctx, VarHandle var) override {
     this->PushSync(delete_fn, exec_ctx, {}, {var},
-                   FnProperty::kNormal, 0, PROFILER_MESSAGE("DeleteVariable"));
+                   FnProperty::kNormal, 0, "DeleteVariable");
   }
 
   void WaitForVar(VarHandle var) override {
@@ -204,7 +196,6 @@ class NaiveEngine final : public Engine {
   std::vector<mshadow::Stream<gpu>*> streams_;
 };  // class NaiveEngine
 
-
 Engine *CreateNaiveEngine() {
   return new NaiveEngine();
 }
diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc
index ca5602bb4823..29100602469d 100644
--- a/src/engine/threaded_engine.cc
+++ b/src/engine/threaded_engine.cc
@@ -274,7 +274,7 @@ void ThreadedEngine::DeleteOperator(OprHandle op) {
       ThreadedOpr::Delete(threaded_opr);
       on_complete();
     }, Context::CPU(), {}, deps, FnProperty::kAsync, 0,
-    PROFILER_MESSAGE("DeleteOperator"));
+    "DeleteOperator");
 }
 
 void ThreadedEngine::Push(OprHandle op, Context exec_ctx, int priority, bool profiling) {
@@ -312,11 +312,7 @@ void ThreadedEngine::PushAsync(AsyncFn fn, Context exec_ctx,
   BulkFlush();
   ThreadedOpr *opr = NewOperator(std::move(fn), const_vars, mutable_vars, prop, opr_name, wait);
   opr->temporary = true;
-#if MXNET_USE_PROFILER
   const bool profiling = profiler_->IsProfiling(profiler::Profiler::kImperative);
-#else
-  const bool profiling = false;
-#endif
   Push(opr, exec_ctx, priority, profiling);
 }
 
@@ -350,7 +346,7 @@ void ThreadedEngine::DeleteVariable(SyncFn delete_fn,
       delete_fn(ctx);
       on_complete();
     }, exec_ctx, {}, {var}, FnProperty::kDeleteVar, 0,
-    PROFILER_MESSAGE("DeleteVariable"));
+    "DeleteVariable");
 }
 
 void ThreadedEngine::WaitForVar(VarHandle var) {
@@ -379,7 +375,7 @@ void ThreadedEngine::WaitForVar(VarHandle var) {
       }
       on_complete();
     }, Context::CPU(), {var}, {}, FnProperty::kNormal, 0,
-    PROFILER_MESSAGE("WaitForVar"), true);
+    "WaitForVar", true);
   {
     std::unique_lock<std::mutex> lock{finished_m_};
     finished_cv_.wait(lock, [this, &done]() {
@@ -463,12 +459,10 @@ void ThreadedEngine::OnCompleteStatic(
     Engine *engine, void *opr_block_) {
   OprBlock *opr_block = static_cast<OprBlock*>(opr_block_);
   ThreadedOpr *threaded_opr = opr_block->opr;
-#if MXNET_USE_PROFILER
   if (opr_block->profiling && threaded_opr->opr_name) {
     // record operator end timestamp
     opr_block->opr_profile->stop();
   }
-#endif
   static_cast<ThreadedEngine*>(engine)->OnComplete(threaded_opr);
   OprBlock::Delete(opr_block);
 }
diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h
index 673fa3bf9ebb..bfb1b1d932ca 100644
--- a/src/engine/threaded_engine.h
+++ b/src/engine/threaded_engine.h
@@ -305,10 +305,8 @@ class ThreadedEngine : public Engine {
     objpool_varblk_ref_ = common::ObjectPool<VersionedVarBlock>::_GetSharedRef();
     objpool_var_ref_    = common::ObjectPool<ThreadedVar>::_GetSharedRef();
 
-#ifdef MXNET_USE_PROFILER
     // Get a ref to the profiler so that it doesn't get killed before us
     profiler::Profiler::Get(&profiler_);
-#endif  // MXNET_USE_PROFILER
   }
   ~ThreadedEngine() {
     {
@@ -336,7 +334,6 @@ class ThreadedEngine : public Engine {
    */
   void ExecuteOprBlock(RunContext run_ctx, OprBlock* opr_block) {
     ThreadedOpr* threaded_opr = opr_block->opr;
-#if MXNET_USE_PROFILER
     if (opr_block->profiling && threaded_opr->opr_name) {
       std::unique_ptr<profiler::ProfileOperator::Attributes> attrs;
       if (profiler_->AggregateEnabled()) {
@@ -347,7 +344,6 @@ class ThreadedEngine : public Engine {
                                                                  attrs.release()));
       opr_block->opr_profile->start(ctx.dev_type, ctx.dev_id);
     }
-#endif
     CallbackOnComplete callback =
         this->CreateCallback(ThreadedEngine::OnCompleteStatic, opr_block);
     const bool debug_info = (engine_info_ && debug_push_opr_ == opr_block);
@@ -539,10 +535,8 @@ class ThreadedEngine : public Engine {
   std::shared_ptr<common::ObjectPool<VersionedVarBlock> > objpool_varblk_ref_;
   std::shared_ptr<common::ObjectPool<ThreadedVar> >       objpool_var_ref_;
 
-#if MXNET_USE_PROFILER
   /*! \brief Hold a ref count ot the profiler */
   std::shared_ptr<profiler::Profiler> profiler_;
-#endif  // MXNET_USE_PROFILER
 
   /*!
    * \brief Disallow copy construction and assignment.
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index 32772f2108d1..9108bae17323 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -1235,11 +1235,7 @@ void GraphExecutor::InitCachedOps() {
   for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
     const auto& inode = idx[nid];
     if (inode.source->is_variable()) continue;
-#if MXNET_USE_PROFILER
     op_nodes_[nid].opr_name = inode.source->op()->name.c_str();
-#else
-    op_nodes_[nid].opr_name = nullptr;
-#endif
     if (skip_plus_node.at(nid)) {
       op_nodes_[nid].skip_exec_node = true; continue;
     }
@@ -1309,7 +1305,7 @@ void GraphExecutor::InitCachedOps() {
         exec->Setup();
         on_complete();
       }, Context::CPU(), {}, all_vars, FnProperty::kNormal, 0,
-      PROFILER_MESSAGE("SetupExec"));
+      "SetupExec");
     auto exec_fun = [exec, is_async, is_gpu] (
         RunContext ctx, Engine::CallbackOnComplete on_complete) {
       if (is_async) {
@@ -1332,7 +1328,7 @@ void GraphExecutor::InitCachedOps() {
     // setup the vars
     op_nodes_[nid].cached_opr = Engine::Get()->NewOperator(
         exec_fun, use_vars, mutate_vars, FnProperty::kNormal,
-        PROFILER_MESSAGE(op_nodes_[nid].opr_name));
+        op_nodes_[nid].opr_name);
     op_nodes_[nid].mutate_vars = mutate_vars;
     op_nodes_[nid].use_vars = use_vars;
   }
@@ -1484,11 +1480,7 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) {
     auto seg_op = cached_seg_opr_[nid];
     // Check segments first
     if (monitor_callback_ == nullptr && seg_op.opr != nullptr && seg_op.topo_end <= topo_end) {
-#if MXNET_USE_PROFILER
       bool profiling = profiler::Profiler::Get()->GetState() == profiler::Profiler::kRunning;
-#else
-      bool profiling = false;
-#endif
       Engine::Get()->Push(seg_op.opr, seg_op.ctx, 0, profiling);
       nid = seg_op.topo_end - 1;
       continue;
@@ -1505,11 +1497,7 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) {
       CHECK_EQ(opnode.exec->out_array.size(), 1U);
       CopyFromTo(opnode.exec->in_array[0], &(opnode.exec->out_array[0]));
     } else if (opnode.cached_opr != nullptr) {
-#if MXNET_USE_PROFILER
       bool profiling = profiler::Profiler::Get()->GetState() == profiler::Profiler::kRunning;
-#else
-      bool profiling = false;
-#endif
       Engine::Get()->Push(opnode.cached_opr, opnode.ctx, 0, profiling);
     } else {
       LOG(FATAL) << "Not accessed";
@@ -1533,11 +1521,7 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start,
   if (topo_end <= topo_start) {
     return ret;
   }
-#if MXNET_USE_PROFILER
   std::string opr_names = "[";
-#else
-  std::string opr_names = "Bulk Execution";
-#endif
 
   const auto& idx = graph_.indexed_graph();
   for (size_t nid = topo_start; nid < topo_end; ++nid) {
@@ -1559,9 +1543,7 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start,
     std::copy(op_node.use_vars.begin(), op_node.use_vars.end(),
               std::inserter(use_vars, use_vars.end()));
     ret.exec_list.push_back(exec);
-#if MXNET_USE_PROFILER
     opr_names += inode.source->op()->name + ",";
-#endif
   }
 
   if (pctx == nullptr) return ret;
@@ -1585,17 +1567,12 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start,
     }
     on_complete();
   };
-#if MXNET_USE_PROFILER
-    opr_names.pop_back();
-    opr_names += "]";
-    // the lifetime of `opr_names.c_str()` is same with opr_names
-    // you need to copy it out. (potential memory leak risk)
-    char *p_opr_name = new char[opr_names.size() + 1];
-    memcpy(p_opr_name, opr_names.c_str(), opr_names.size() + 1);
-#endif
+  opr_names.pop_back();
+  opr_names += "]";
+  auto iter = cached_seg_opr_names_.insert(opr_names).first;
   ret.opr = Engine::Get()->NewOperator(
-      exec_fun, use_vars, mutate_vars, FnProperty::kNormal,
-      PROFILER_MESSAGE(p_opr_name));
+    exec_fun, use_vars, mutate_vars, FnProperty::kNormal,
+    iter->c_str());
   return ret;
 }
 }  // namespace exec
diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h
index ee32db72cebd..3f1ebe568f80 100644
--- a/src/executor/graph_executor.h
+++ b/src/executor/graph_executor.h
@@ -34,6 +34,7 @@
 #include <nnvm/op_attr_types.h>
 #include <nnvm/graph_attr_types.h>
 #include <map>
+#include <unordered_set>
 #include <string>
 #include <utility>
 #include <vector>
@@ -241,6 +242,8 @@ class GraphExecutor : public Executor {
   bool prefer_bulk_execution_;
   // cached segment operator
   std::vector<CachedSegOpr> cached_seg_opr_;
+  // cached segment operator names (these strings must outlive the operators in cached_seg_opr_)
+  std::unordered_set<std::string> cached_seg_opr_names_;
   // verbose logging
   bool log_verbose_ = false;
 };
diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index 044ab823f770..e7f00b52d659 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -381,7 +381,7 @@ inline void PushFCompute(const FCompute& fn,
         rctx.get_stream<gpu>()->Wait();
       }
     }, ctx, read_vars, write_vars, FnProperty::kNormal,
-    0, PROFILER_MESSAGE(op->name.c_str()));
+    0, op->name.c_str());
 }
 
 inline void PushFComputeEx(const FComputeEx& fn,
@@ -413,7 +413,7 @@ inline void PushFComputeEx(const FComputeEx& fn,
   } else {
     CHECK(exec_type == ExecType::kSync);
     Engine::Get()->PushSync(run, ctx, read_vars, write_vars, FnProperty::kNormal,
-                            0, PROFILER_MESSAGE(op->name.c_str()));
+                            0, op->name.c_str());
   }
 }
 
@@ -449,7 +449,7 @@ inline void PushOperator(const OpStatePtr& state,
             rctx.get_stream<gpu>()->Wait();
           }
         }, ctx, read_vars, write_vars, FnProperty::kNormal,
-        0, PROFILER_MESSAGE(op->name.c_str()));
+        0, op->name.c_str());
   } else {
     CHECK(fcompute != nullptr)
         << "One of FStatefulCompute and FStatefulComputeEx must be registered "
@@ -484,12 +484,12 @@ inline void PushOperator(const OpStatePtr& state,
           [=](RunContext rctx) {
             run(rctx, engine::CallbackOnComplete());
           }, ctx, read_vars, write_vars, FnProperty::kNormal,
-          0, PROFILER_MESSAGE(op->name.c_str()));
+          0, op->name.c_str());
     } else {
       CHECK(exec_type == ExecType::kAsync);
       Engine::Get()->PushAsync(
           run, ctx, read_vars, write_vars, FnProperty::kAsync,
-          0, PROFILER_MESSAGE(op->name.c_str()));
+          0, op->name.c_str());
     }
   }
 }
diff --git a/src/io/image_io.cc b/src/io/image_io.cc
index f6183a12c2dc..a996a2208d79 100644
--- a/src/io/image_io.cc
+++ b/src/io/image_io.cc
@@ -206,7 +206,7 @@ void Imdecode(const nnvm::NodeAttrs& attrs,
       ImdecodeImpl(param.flag, param.to_rgb, str_img, len,
                    const_cast<NDArray*>(&ndout));
     }, ndout.ctx(), {ndin.var()}, {ndout.var()},
-    FnProperty::kNormal, 0, PROFILER_MESSAGE("Imdecode"));
+    FnProperty::kNormal, 0, "Imdecode");
 #else
   LOG(FATAL) << "Build with USE_OPENCV=1 for image io.";
 #endif  // MXNET_USE_OPENCV
@@ -245,7 +245,7 @@ void Imread(const nnvm::NodeAttrs& attrs,
       ImdecodeImpl(param.flag, param.to_rgb, buff.get(), fsize,
                    const_cast<NDArray*>(&ndout));
     }, ndout.ctx(), {}, {ndout.var()},
-    FnProperty::kNormal, 0, PROFILER_MESSAGE("Imread"));
+    FnProperty::kNormal, 0, "Imread");
 #else
   LOG(FATAL) << "Build with USE_OPENCV=1 for image io.";
 #endif  // MXNET_USE_OPENCV
diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h
index 3085966e99b0..96248998902c 100644
--- a/src/kvstore/comm.h
+++ b/src/kvstore/comm.h
@@ -158,7 +158,7 @@ class CommCPU : public Comm {
           ReduceSumCPU(reduce);
           on_complete();
         }, Context::CPU(), const_vars, {reduce[0].var()},
-        FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreReduce"));
+        FnProperty::kCPUPrioritized, priority, "KVStoreReduce");
 
     } else {
       // buf.merged is a sparse ndarray.
@@ -188,7 +188,7 @@ class CommCPU : public Comm {
             : mxnet::ndarray::ElementwiseSum(rctx.get_stream<cpu>(), rsc, reduce, &out);
           on_complete();
         }, Context::CPU(), const_vars, {result.var(), rsc.var},
-        FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreReduce"));
+        FnProperty::kCPUPrioritized, priority, "KVStoreReduce");
     }
 
     return buf.merged;
@@ -235,7 +235,7 @@ class CommCPU : public Comm {
                                                 &temp);
           on_complete();
         }, Context::CPU(), {src.var(), row_id.var()}, {retained_cpu.var()},
-        FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreSparseRetain"));
+        FnProperty::kNormal, priority, "KVStoreSparseRetain");
       // if retained_cpu == out, CopyFromTo will ignore the copy operation
       CopyFromTo(retained_cpu, out, priority);
     }
@@ -592,7 +592,7 @@ class CommDevice : public Comm {
           }
           on_complete();
         }, out_gpu.ctx(), {src.var(), row_id.var()}, {out_gpu.var()},
-      FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreSparseRetain"));
+      FnProperty::kNormal, priority, "KVStoreSparseRetain");
       CopyFromTo(out_gpu, out, priority);
     }
   }
diff --git a/src/kvstore/gradient_compression.cc b/src/kvstore/gradient_compression.cc
index b8c626cd53a8..e94a0570d1f4 100644
--- a/src/kvstore/gradient_compression.cc
+++ b/src/kvstore/gradient_compression.cc
@@ -129,7 +129,7 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t
         std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
         Quantize2BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
       }, from.ctx(), {from.var()}, {to->var(), residual->var()},
-      mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU"));
+      mxnet::FnProperty::kNormal, priority, "QuantizeCPU");
     } else {
 #if MXNET_USE_CUDA
       if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
@@ -139,7 +139,7 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t
           // Wait GPU kernel to complete
           ctx.get_stream<mshadow::gpu>()->Wait();
         }, from.ctx(), {from.var()}, {to->var(), residual->var()},
-        mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU"));
+        mxnet::FnProperty::kNormal, priority, "QuantizeGPU");
       } else {
         LOG(FATAL) << "unknown device mask";
       }
@@ -165,7 +165,7 @@ void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray
         std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
         Dequantize2BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
       }, from.ctx(), {from.var()}, {to->var()},
-      mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU"));
+      mxnet::FnProperty::kNormal, priority, "DequantizeCPU");
     } else {
 #if MXNET_USE_CUDA
       if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
@@ -175,7 +175,7 @@ void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray
           // Wait GPU kernel to complete
           ctx.get_stream<mshadow::gpu>()->Wait();
         }, from.ctx(), {from.var()}, {to->var()},
-        mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU"));
+        mxnet::FnProperty::kNormal, priority, "DequantizeGPU");
       } else {
         LOG(FATAL) << "unknown device mask";
       }
diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h
index 7ab5783f7fce..afba9ac5f274 100644
--- a/src/kvstore/kvstore_dist.h
+++ b/src/kvstore/kvstore_dist.h
@@ -250,7 +250,7 @@ class KVStoreDist : public KVStoreLocal {
           {recv_buf.var()},
           FnProperty::kNormal,
           priority,
-          PROFILER_MESSAGE("KVStoreDistDefaultStoragePull"));
+          "KVStoreDistDefaultStoragePull");
 
       comm_->Broadcast(key, recv_buf, grouped_vals[i], priority);
     }
@@ -392,7 +392,7 @@ class KVStoreDist : public KVStoreLocal {
       {},
       FnProperty::kNormal,
       priority,
-      PROFILER_MESSAGE("KVStoreDistCompressedPush"));
+      "KVStoreDistCompressedPush");
   }
 
   void PushDefault(int key, const NDArray &send_buf, const PSKV& pskv, int priority) {
@@ -414,7 +414,7 @@ class KVStoreDist : public KVStoreLocal {
         {},
         FnProperty::kNormal,
         priority,
-        PROFILER_MESSAGE("KVStoreDistDefaultPush"));
+        "KVStoreDistDefaultPush");
   }
 
   // push row sparse gradient
@@ -447,7 +447,7 @@ class KVStoreDist : public KVStoreLocal {
         {},
         FnProperty::kNormal,
         priority,
-        PROFILER_MESSAGE("KVStoreDistRowSparsePush"));
+        "KVStoreDistRowSparsePush");
   }
 
 
@@ -490,7 +490,7 @@ class KVStoreDist : public KVStoreLocal {
       {recv_buf.var()},
       FnProperty::kNormal,
       priority,
-      PROFILER_MESSAGE("KVStoreDistRowSparsePull"));
+      "KVStoreDistRowSparsePull");
   }
 
   /**
diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h
index 69fb37ec5f39..3383c97f9262 100644
--- a/src/kvstore/kvstore_local.h
+++ b/src/kvstore/kvstore_local.h
@@ -404,7 +404,7 @@ class KVStoreLocal : public KVStore {
         }
         on_complete();
       }, out.ctx(), {data_in_ctx.var()}, mutate_vars,
-      FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreUnique"));
+      FnProperty::kNormal, priority, "KVStoreUnique");
     return out;
   }
 
diff --git a/src/kvstore/kvstore_nccl.h b/src/kvstore/kvstore_nccl.h
index e97a6d7f7e55..95ee8147a153 100644
--- a/src/kvstore/kvstore_nccl.h
+++ b/src/kvstore/kvstore_nccl.h
@@ -309,7 +309,7 @@ class KVStoreNCCL : public KVStoreLocal {
       mutate_vars,
       FnProperty::kCPUPrioritized,
       priority,
-      PROFILER_MESSAGE("KVStoreReduce"));
+      "KVStoreReduce");
   }
 
   virtual void Broadcast(const std::vector<int> keys,
@@ -413,7 +413,7 @@ class KVStoreNCCL : public KVStoreLocal {
       mutable_vars,
       FnProperty::kCPUPrioritized,
       priority,
-      PROFILER_MESSAGE("KVStoreBCast"));
+      "KVStoreBCast");
   }
 
   // Function that waits for NCCL collective to complete
@@ -434,7 +434,7 @@ class KVStoreNCCL : public KVStoreLocal {
       mutate_vars,
       FnProperty::kCPUPrioritized,
       priority,
-      PROFILER_MESSAGE("KVStoreStreamSync"));
+      "KVStoreStreamSync");
   }
 
   // Initialize single key
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 52b96fad6929..7debfea14eea 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -562,7 +562,7 @@ void NDArray::Reorder2DefaultAsync() {
       tmp.ptr_->Reorder2Default();
       on_complete();
     }, ctx(), const_vars, mutable_vars,
-    FnProperty::kNormal, 0, PROFILER_MESSAGE("Reorder2Default"));
+    FnProperty::kNormal, 0, "Reorder2Default");
 }
 
 void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::primitive_desc &desc) {
@@ -574,7 +574,7 @@ void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::primitive_desc &desc)
       tmp.ptr_->MKLDNNDataReorder(desc);
       on_complete();
     }, ctx(), const_vars, mutable_vars,
-    FnProperty::kNormal, 0, PROFILER_MESSAGE("Reorder"));
+    FnProperty::kNormal, 0, "Reorder");
 }
 
 const mkldnn::memory *NDArray::GetMKLDNNData() const {
@@ -1236,7 +1236,7 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) {
         CopyFromToImpl<cpu, cpu>(from, to, ctx, requested);
         on_complete();
       }, from.ctx(), const_vars, mutable_vars,
-      FnProperty::kNormal, priority, PROFILER_MESSAGE("CopyCPU2CPU"));
+      FnProperty::kNormal, priority, "CopyCPU2CPU");
   } else {
 #if MXNET_USE_CUDA
     if (a == cpu::kDevMask && b == gpu::kDevMask) {
@@ -1246,7 +1246,7 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) {
           ctx.get_stream<gpu>()->Wait();
           on_complete();
         }, to.ctx(), const_vars, mutable_vars,
-        FnProperty::kCopyToGPU, priority, PROFILER_MESSAGE("CopyCPU2GPU"));
+        FnProperty::kCopyToGPU, priority, "CopyCPU2GPU");
     } else if (a == gpu::kDevMask && b == cpu::kDevMask) {
       Engine::Get()->PushAsync(
         [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
@@ -1254,7 +1254,7 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) {
           ctx.get_stream<gpu>()->Wait();
           on_complete();
         }, from.ctx(), const_vars, mutable_vars,
-        FnProperty::kCopyFromGPU, priority, PROFILER_MESSAGE("CopyGPU2CPU"));
+        FnProperty::kCopyFromGPU, priority, "CopyGPU2CPU");
     } else if (a == gpu::kDevMask && b == gpu::kDevMask) {
       Engine::Get()->PushAsync(
         [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
@@ -1263,7 +1263,7 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) {
           on_complete();
         }, from.ctx(), const_vars, mutable_vars,
         from.dtype() != to.dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
-        priority, PROFILER_MESSAGE("CopyGPU2GPU"));
+        priority, "CopyGPU2GPU");
     } else {
       LOG(FATAL) << "unknown device mask";
     }
@@ -1326,7 +1326,7 @@ void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priori
             // Wait GPU kernel to complete
             ctx.get_stream<gpu>()->Wait();
           }, out->ctx(), const_vars, {ret.var()},
-          FnProperty::kNormal, priority, PROFILER_MESSAGE("DenseElementwiseSum"));
+          FnProperty::kNormal, priority, "DenseElementwiseSum");
         break;
       }
 #endif
@@ -1355,7 +1355,7 @@ void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priori
           default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
         }
       }, ret.ctx(), const_vars, {ret.var(), rsc.var},
-    FnProperty::kNormal, priority, PROFILER_MESSAGE("RowSparseElementwiseSum"));
+    FnProperty::kNormal, priority, "RowSparseElementwiseSum");
   } else {
     LOG(FATAL) << "Not implemented for storage_type " << common::stype_string(stype);
   }
@@ -1831,7 +1831,7 @@ void NDArray::SyncCopyFromCPU(const void *data, size_t size) const {
         rctx.get_stream<gpu>()->Wait();
         on_complete();
       }, this->ctx(), {}, {this->var()},
-      FnProperty::kCopyToGPU, 0, PROFILER_MESSAGE("SyncCopyCPU2GPU"));
+      FnProperty::kCopyToGPU, 0, "SyncCopyCPU2GPU");
     this->WaitToRead();
 #else
     LOG(FATAL) << "GPU is not enabled";
@@ -1888,7 +1888,7 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) {
         TBlob dst_data = get_dst_data(src_data.shape_);
         ndarray::Copy<cpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
       }, this->ctx(), const_vars, {this->var()},
-      FnProperty::kNormal, 0, PROFILER_MESSAGE("SyncCopyFromNDArrayCPU2CPU"));
+      FnProperty::kNormal, 0, "SyncCopyFromNDArrayCPU2CPU");
   } else {
 #if MXNET_USE_CUDA
     if (src_dev_mask == cpu::kDevMask && dst_dev_mask == gpu::kDevMask) {
@@ -1900,7 +1900,7 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) {
           rctx.get_stream<gpu>()->Wait();
           on_complete();
         }, this->ctx(), const_vars, {this->var()},
-        FnProperty::kCopyToGPU, 0, PROFILER_MESSAGE("SyncCopyFromNDArrayCPU2GPU"));
+        FnProperty::kCopyToGPU, 0, "SyncCopyFromNDArrayCPU2GPU");
     } else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == cpu::kDevMask) {
       Engine::Get()->PushAsync(
         [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
@@ -1910,7 +1910,7 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) {
           rctx.get_stream<gpu>()->Wait();
           on_complete();
         }, this->ctx(), const_vars, {this->var()},
-        FnProperty::kCopyFromGPU, 0, PROFILER_MESSAGE("SyncCopyFromNDArrayGPU2CPU"));
+        FnProperty::kCopyFromGPU, 0, "SyncCopyFromNDArrayGPU2CPU");
     } else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == gpu::kDevMask) {
       Engine::Get()->PushAsync(
         [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
@@ -1921,7 +1921,7 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) {
           on_complete();
         }, this->ctx(), const_vars, {this->var()},
         src.dtype() != this->dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
-        0, PROFILER_MESSAGE("SyncCopyFromNDArrayGPU2GPU"));
+        0, "SyncCopyFromNDArrayGPU2GPU");
     } else {
       LOG(FATAL) << "unknown device mask";
     }
@@ -1966,7 +1966,7 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const {
         rctx.get_stream<gpu>()->Wait();
         on_complete();
       }, this->ctx(), {this->var()}, {},
-      FnProperty::kCopyFromGPU, 0, PROFILER_MESSAGE("SyncCopyGPU2CPU"));
+      FnProperty::kCopyFromGPU, 0, "SyncCopyGPU2CPU");
     this->WaitToWrite();
 #else
     LOG(FATAL) << "GPU is not enabled";
@@ -1981,14 +1981,14 @@ void NDArray::SyncCheckFormat(const bool full_check) const {
     Engine::Get()->PushSync([&](RunContext rctx) {
         common::CheckFormatWrapper<cpu>(rctx, *this, err_cpu, full_check);
       }, this->ctx(), {this->var()}, {},
-      FnProperty::kNormal, 0, PROFILER_MESSAGE("CheckFormat"));
+      FnProperty::kNormal, 0, "CheckFormat");
   } else {
 #if MXNET_USE_CUDA
     Engine::Get()->PushSync([&](RunContext rctx) {
         common::CheckFormatWrapper<gpu>(rctx, *this, err_cpu, full_check);
         rctx.get_stream<gpu>()->Wait();
       }, this->ctx(), {this->var()}, {},
-      FnProperty::kNormal, 0, PROFILER_MESSAGE("CheckFormat"));
+      FnProperty::kNormal, 0, "CheckFormat");
 #else
     LOG(FATAL) << "GPU is not enabled";
 #endif
diff --git a/src/operator/custom/custom-inl.h b/src/operator/custom/custom-inl.h
index 38aeefd66a4e..13c8d3434b18 100644
--- a/src/operator/custom/custom-inl.h
+++ b/src/operator/custom/custom-inl.h
@@ -91,7 +91,7 @@ class CustomOperator {
         Engine::Get()->PushSync([=](RunContext rctx) {
             ctx.async_on_complete();
           }, ctx.run_ctx.ctx, vars, {},
-          FnProperty::kNormal, 0, PROFILER_MESSAGE("CustomOperator"));
+          FnProperty::kNormal, 0, "CustomOperator");
       });
     cv_.notify_all();
   }
diff --git a/src/operator/custom/ndarray_op.cc b/src/operator/custom/ndarray_op.cc
index 9ad0d09e3b0d..396c43555dc8 100644
--- a/src/operator/custom/ndarray_op.cc
+++ b/src/operator/custom/ndarray_op.cc
@@ -89,7 +89,7 @@ void NDArrayOp<xpu>::Forward(const OpContext &ctx,
       [ndcpy, ctx](RunContext rctx, Engine::CallbackOnComplete on_complete) {
         ctx.async_on_complete();
         on_complete();
-      }, ndctx, ndvar, {}, FnProperty::kNormal, 0, PROFILER_MESSAGE("NDArrayOpForward"));
+      }, ndctx, ndvar, {}, FnProperty::kNormal, 0, "NDArrayOpForward");
 }
 
 template<typename xpu>
@@ -138,7 +138,7 @@ void NDArrayOp<xpu>::Backward(const OpContext &ctx,
       [ndcpy, ctx](RunContext rctx, Engine::CallbackOnComplete on_complete){
         ctx.async_on_complete();
         on_complete();
-      }, ndctx, ndvar, {}, FnProperty::kNormal, 0, PROFILER_MESSAGE("NDArrayOpBackward"));
+      }, ndctx, ndvar, {}, FnProperty::kNormal, 0, "NDArrayOpBackward");
 }
 
 Operator* NDArrayOpProp::CreateOperator(Context ctx) const {
diff --git a/src/operator/operator_util.cc b/src/operator/operator_util.cc
index bae3cb6a2964..326a1ca38ba3 100644
--- a/src/operator/operator_util.cc
+++ b/src/operator/operator_util.cc
@@ -506,7 +506,7 @@ void SimpleOpRegEntryImpl::RegisterSourceImperative() {
         }
 #endif
       }, ret.ctx(), {}, write_vars,
-      FnProperty::kNormal, 0, PROFILER_MESSAGE("RegisterSourceImperative"));
+      FnProperty::kNormal, 0, "RegisterSourceImperative");
   };
   // register the function.
   NDArrayReg()
@@ -690,7 +690,7 @@ void SimpleOpRegEntryImpl::RegisterUnaryImperative() {
         }
 #endif
       }, src.ctx(), const_vars, write_vars,
-      FnProperty::kNormal, 0, PROFILER_MESSAGE("RegisterUnaryImperative"));
+      FnProperty::kNormal, 0, "RegisterUnaryImperative");
   };
   // register the function.
   NDArrayReg()
@@ -964,7 +964,7 @@ void SimpleOpRegEntryImpl::RegisterBinaryImperative() {
         }
         #endif
       }, lhs.ctx(), const_vars, write_vars,
-      FnProperty::kNormal, 0, PROFILER_MESSAGE("RegisterBinaryImperative"));
+      FnProperty::kNormal, 0, "RegisterBinaryImperative");
   };
   // register the function.
   NDArrayReg()
diff --git a/src/optimizer/sgd-inl.h b/src/optimizer/sgd-inl.h
index 3c0224d28070..12738f8e4053 100644
--- a/src/optimizer/sgd-inl.h
+++ b/src/optimizer/sgd-inl.h
@@ -142,12 +142,12 @@ class SGDOpt : public Optimizer {
         Engine::Get()->PushSync([this, index, w, g, lr, wd](RunContext ctx) {
           call_sgd_mom_update_cpu(ctx, w.data(), g.data(), mom[index].data(), lr, wd, param_);
         }, w.ctx(), {g.var()}, {w.var(), mom[index].var()},
-        FnProperty::kNormal, 0, PROFILER_MESSAGE("SGDOptUpdate"));
+        FnProperty::kNormal, 0, "SGDOptUpdate");
       } else {
         Engine::Get()->PushSync([this, index, w, g, lr, wd](RunContext ctx) {
           call_sgd_update_cpu(ctx, w.data(), g.data(), lr, wd, param_);
         }, w.ctx(), {g.var()}, {w.var()},
-        FnProperty::kNormal, 0, PROFILER_MESSAGE("SGDOptUpdate"));
+        FnProperty::kNormal, 0, "SGDOptUpdate");
       }
       break;
      case Context::kGPU:
@@ -156,12 +156,12 @@ class SGDOpt : public Optimizer {
         Engine::Get()->PushSync([this, index, w, g, lr, wd](RunContext ctx) {
           call_sgd_mom_update_gpu(ctx, w.data(), g.data(), mom[index].data(), lr, wd, param_);
         }, w.ctx(), {g.var()}, {w.var(), mom[index].var()},
-        FnProperty::kNormal, 0, PROFILER_MESSAGE("SGDOptUpdate"));
+        FnProperty::kNormal, 0, "SGDOptUpdate");
       } else {
         Engine::Get()->PushSync([this, index, w, g, lr, wd](RunContext ctx) {
           call_sgd_update_gpu(ctx, w.data(), g.data(), lr, wd, param_);
         }, w.ctx(), {g.var()}, {w.var()},
-        FnProperty::kNormal, 0, PROFILER_MESSAGE("SGDOptUpdate"));
+        FnProperty::kNormal, 0, "SGDOptUpdate");
       }
       break;
 #else
diff --git a/src/profiler/profiler.cc b/src/profiler/profiler.cc
index e6dafb3d0916..f2d14cf2729a 100644
--- a/src/profiler/profiler.cc
+++ b/src/profiler/profiler.cc
@@ -98,7 +98,6 @@ Profiler::~Profiler() {
 }
 
 Profiler* Profiler::Get(std::shared_ptr<Profiler> *sp) {
-#if MXNET_USE_PROFILER
   static std::mutex mtx;
   static std::shared_ptr<Profiler> prof = nullptr;
   if (!prof) {
@@ -111,9 +110,6 @@ Profiler* Profiler::Get(std::shared_ptr<Profiler> *sp) {
     *sp = prof;
   }
   return prof.get();
-#else
-  return nullptr;
-#endif
 }
 
 void Profiler::SetState(ProfilerState state) {
diff --git a/src/profiler/storage_profiler.h b/src/profiler/storage_profiler.h
index b9a7e01e8463..bcbe7e7e3ffd 100644
--- a/src/profiler/storage_profiler.h
+++ b/src/profiler/storage_profiler.h
@@ -27,7 +27,6 @@
 namespace mxnet {
 namespace storage {
 
-#if MXNET_USE_PROFILER
 /*!
  * \brief Storage allocation/deallocation profiling via ProfileCounters
  */
@@ -103,8 +102,6 @@ class DeviceStorageProfiler {
   std::vector<std::shared_ptr<profiler::ProfileCounter>> mem_counters_;
 };
 
-#endif  // MXNET_USE_PROFILER
-
 }  // namespace storage
 }  // namespace mxnet
 
diff --git a/src/resource.cc b/src/resource.cc
index c2b260985a5f..18927f0cd337 100644
--- a/src/resource.cc
+++ b/src/resource.cc
@@ -211,7 +211,7 @@ class ResourceManagerImpl : public ResourceManager {
           r->Seed(seed);
           on_complete();
         }, ctx, {}, {resource.var},
-        FnProperty::kNormal, 0, PROFILER_MESSAGE("ResourceRandomSetSeed"));
+        FnProperty::kNormal, 0, "ResourceRandomSetSeed");
     }
   };
 
@@ -284,7 +284,7 @@ class ResourceManagerImpl : public ResourceManager {
           common::random::RandGenerator<xpu>::AllocState(r);
           r->Seed(rctx.get_stream<xpu>(), seed);
         }, ctx, {}, {resource[i].var},
-        FnProperty::kNormal, 0, PROFILER_MESSAGE("ResourceParallelRandomSetSeed"));
+        FnProperty::kNormal, 0, "ResourceParallelRandomSetSeed");
         sampler[i] = r;
         resource[i].ptr_ = sampler[i];
         resource[i].req = ResourceRequest(ResourceRequest::kParallelRandom);
@@ -310,7 +310,7 @@ class ResourceManagerImpl : public ResourceManager {
           r->Seed(rctx.get_stream<xpu>(), seed);
           on_complete();
         }, ctx, {}, {resource[i].var},
-        FnProperty::kNormal, 0, PROFILER_MESSAGE("ResourceNativeRandomSetSeed"));
+        FnProperty::kNormal, 0, "ResourceNativeRandomSetSeed");
       }
       // reset pointer to ensure the same result with the same seed.
       curr_ptr.store(0);
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index a4bccec0a048..44ae36594703 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -74,9 +74,7 @@ class StorageImpl : public Storage {
   // internal storage managers
   std::array<common::LazyAllocArray<storage::StorageManager>,
              kMaxNumberOfDevices> storage_managers_;
-#if MXNET_USE_PROFILER
   storage::DeviceStorageProfiler profiler_;
-#endif  // MXNET_USE_PROFILER
 };  // struct Storage::Impl
 #if MXNET_USE_CUDA
 int StorageImpl::num_gpu_device = 0;
@@ -133,9 +131,7 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
 
   this->ActivateDevice(handle->ctx);
   manager->Alloc(handle);
-#if MXNET_USE_PROFILER
   profiler_.OnAlloc(*handle);
-#endif  // MXNET_USE_PROFILER
 }
 
 void StorageImpl::Free(Storage::Handle handle) {
@@ -148,9 +144,7 @@ void StorageImpl::Free(Storage::Handle handle) {
       });
   this->ActivateDevice(ctx);
   manager->Free(handle);
-#if MXNET_USE_PROFILER
   profiler_.OnFree(handle);
-#endif  // MXNET_USE_PROFILER
 }
 
 void StorageImpl::DirectFree(Storage::Handle handle) {
@@ -163,9 +157,7 @@ void StorageImpl::DirectFree(Storage::Handle handle) {
       });
   this->ActivateDevice(ctx);
   manager->DirectFree(handle);
-#if MXNET_USE_PROFILER
   profiler_.OnFree(handle);
-#endif  // MXNET_USE_PROFILER
 }
 
 void StorageImpl::SharedIncrementRefCount(Storage::Handle handle) {