From 431eb5c618205f0c5ee859f08ecd6c4ff1b06e1b Mon Sep 17 00:00:00 2001 From: Chris Olivier Date: Sun, 1 Apr 2018 16:19:45 -0700 Subject: [PATCH] [MXNET-247] Always build profiler (#10308) * Always build profiler * Update naive_engine.cc * remove PROFILER_MESSAGE macro * Remove USE_PROFILER=1 from CI runs --- CMakeLists.txt | 4 -- Makefile | 5 -- ci/docker/runtime_functions.sh | 39 +++++-------- include/mxnet/base.h | 11 +--- make/config.mk | 3 - src/c_api/c_api_profile.cc | 90 ----------------------------- src/common/rtc.cc | 2 +- src/engine/naive_engine.cc | 13 +---- src/engine/threaded_engine.cc | 12 +--- src/engine/threaded_engine.h | 6 -- src/executor/graph_executor.cc | 37 +++--------- src/executor/graph_executor.h | 3 + src/imperative/imperative_utils.h | 10 ++-- src/io/image_io.cc | 4 +- src/kvstore/comm.h | 8 +-- src/kvstore/gradient_compression.cc | 8 +-- src/kvstore/kvstore_dist.h | 10 ++-- src/kvstore/kvstore_local.h | 2 +- src/kvstore/kvstore_nccl.h | 6 +- src/ndarray/ndarray.cc | 32 +++++----- src/operator/custom/custom-inl.h | 2 +- src/operator/custom/ndarray_op.cc | 4 +- src/operator/operator_util.cc | 6 +- src/optimizer/sgd-inl.h | 8 +-- src/profiler/profiler.cc | 4 -- src/profiler/storage_profiler.h | 3 - src/resource.cc | 6 +- src/storage/storage.cc | 8 --- 28 files changed, 84 insertions(+), 262 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 116de37fb857..db14dadf80f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -653,10 +653,6 @@ if(MSVC AND USE_MXNET_LIB_NAMING) set_target_properties(mxnet PROPERTIES OUTPUT_NAME "libmxnet") endif() -if(USE_PROFILER) - add_definitions(-DMXNET_USE_PROFILER) -endif() - add_subdirectory(tests) include(GNUInstallDirs) diff --git a/Makefile b/Makefile index dba649f73112..ae57114f4905 100644 --- a/Makefile +++ b/Makefile @@ -100,11 +100,6 @@ else NVCCFLAGS += -std=c++11 -Xcompiler -D_FORCE_INLINES -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) endif -# CFLAGS for profiler -ifeq ($(USE_PROFILER), 1) - 
CFLAGS += -DMXNET_USE_PROFILER=1 -endif - # CFLAGS for segfault logger ifeq ($(USE_SIGNAL_HANDLER), 1) CFLAGS += -DMXNET_USE_SIGNAL_HANDLER=1 diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index f35de6bef0b9..d6f07da942a5 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -179,7 +179,6 @@ build_centos7_cpu() { DEV=1 \ USE_LAPACK=1 \ USE_LAPACK_PATH=/usr/lib64/liblapack.so \ - USE_PROFILER=1 \ USE_BLAS=openblas \ -j$(nproc) } @@ -191,7 +190,6 @@ build_centos7_mkldnn() { DEV=1 \ USE_LAPACK=1 \ USE_LAPACK_PATH=/usr/lib64/liblapack.so \ - USE_PROFILER=1 \ USE_MKLDNN=1 \ USE_BLAS=openblas \ -j$(nproc) @@ -204,7 +202,6 @@ build_centos7_gpu() { DEV=1 \ USE_LAPACK=1 \ USE_LAPACK_PATH=/usr/lib64/liblapack.so \ - USE_PROFILER=1 \ USE_BLAS=openblas \ USE_CUDA=1 \ USE_CUDA_PATH=/usr/local/cuda \ @@ -216,7 +213,6 @@ build_ubuntu_cpu_openblas() { set -ex make \ DEV=1 \ - USE_PROFILER=1 \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ -j$(nproc) @@ -225,7 +221,6 @@ build_ubuntu_cpu_openblas() { build_ubuntu_cpu_clang39() { set -ex make \ - USE_PROFILER=1 \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ USE_OPENMP=0 \ @@ -237,7 +232,6 @@ build_ubuntu_cpu_clang39() { build_ubuntu_cpu_clang50() { set -ex make \ - USE_PROFILER=1 \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ USE_OPENMP=1 \ @@ -249,7 +243,6 @@ build_ubuntu_cpu_clang50() { build_ubuntu_cpu_clang39_mkldnn() { set -ex make \ - USE_PROFILER=1 \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ USE_MKLDNN=1 \ @@ -262,7 +255,6 @@ build_ubuntu_cpu_clang39_mkldnn() { build_ubuntu_cpu_clang50_mkldnn() { set -ex make \ - USE_PROFILER=1 \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ USE_MKLDNN=1 \ @@ -276,7 +268,6 @@ build_ubuntu_cpu_mkldnn() { set -ex make \ DEV=1 \ - USE_PROFILER=1 \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ USE_MKLDNN=1 \ @@ -287,7 +278,6 @@ build_ubuntu_gpu_mkldnn() { set -ex make \ DEV=1 \ - USE_PROFILER=1 \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ USE_MKLDNN=1 \ @@ -301,7 +291,6 
@@ build_ubuntu_gpu_cuda91_cudnn7() { set -ex make \ DEV=1 \ - USE_PROFILER=1 \ USE_BLAS=openblas \ USE_CUDA=1 \ USE_CUDA_PATH=/usr/local/cuda \ @@ -314,7 +303,7 @@ build_ubuntu_amalgamation() { set -ex # Amalgamation can not be run with -j nproc make -C amalgamation/ clean - make -C amalgamation/ USE_BLAS=openblas + make -C amalgamation/ USE_BLAS=openblas } build_ubuntu_amalgamation_min() { @@ -335,7 +324,7 @@ build_ubuntu_gpu_cmake_mkldnn() { -DCMAKE_BUILD_TYPE=Release \ -G Ninja \ /work/mxnet - + ninja -v } @@ -350,7 +339,7 @@ build_ubuntu_gpu_cmake() { -DCMAKE_BUILD_TYPE=Release \ -G Ninja \ /work/mxnet - + ninja -v } @@ -367,7 +356,7 @@ sanity_check() { unittest_ubuntu_python2_cpu() { set -ex - export PYTHONPATH=./python/ + export PYTHONPATH=./python/ # MXNET_MKLDNN_DEBUG is buggy and produces false positives # https://github.com/apache/incubator-mxnet/issues/10026 #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present @@ -379,7 +368,7 @@ unittest_ubuntu_python2_cpu() { unittest_ubuntu_python3_cpu() { set -ex - export PYTHONPATH=./python/ + export PYTHONPATH=./python/ # MXNET_MKLDNN_DEBUG is buggy and produces false positives # https://github.com/apache/incubator-mxnet/issues/10026 #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present @@ -390,9 +379,9 @@ unittest_ubuntu_python3_cpu() { unittest_ubuntu_python2_gpu() { set -ex - export PYTHONPATH=./python/ + export PYTHONPATH=./python/ # MXNET_MKLDNN_DEBUG is buggy and produces false positives - # https://github.com/apache/incubator-mxnet/issues/10026 + # https://github.com/apache/incubator-mxnet/issues/10026 #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 nosetests-2.7 --verbose tests/python/gpu @@ -400,7 +389,7 @@ unittest_ubuntu_python2_gpu() { unittest_ubuntu_python3_gpu() { set -ex - export PYTHONPATH=./python/ + export PYTHONPATH=./python/ # MXNET_MKLDNN_DEBUG is buggy and produces false positives # https://github.com/apache/incubator-mxnet/issues/10026 
#export MXNET_MKLDNN_DEBUG=1 # Ignored if not present @@ -412,9 +401,9 @@ unittest_ubuntu_python3_gpu() { # need to separte it from unittest_ubuntu_python2_gpu() unittest_ubuntu_python2_quantization_gpu() { set -ex - export PYTHONPATH=./python/ + export PYTHONPATH=./python/ # MXNET_MKLDNN_DEBUG is buggy and produces false positives - # https://github.com/apache/incubator-mxnet/issues/10026 + # https://github.com/apache/incubator-mxnet/issues/10026 #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 nosetests-2.7 --verbose tests/python/quantization_gpu @@ -424,7 +413,7 @@ unittest_ubuntu_python2_quantization_gpu() { # need to separte it from unittest_ubuntu_python3_gpu() unittest_ubuntu_python3_quantization_gpu() { set -ex - export PYTHONPATH=./python/ + export PYTHONPATH=./python/ # MXNET_MKLDNN_DEBUG is buggy and produces false positives # https://github.com/apache/incubator-mxnet/issues/10026 #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present @@ -479,7 +468,7 @@ unittest_centos7_gpu() { python3.6 -m "nose" --with-timer --verbose tests/python/gpu } -integrationtest_ubuntu_cpu_onnx() { +integrationtest_ubuntu_cpu_onnx() { set -ex export PYTHONPATH=./python/ python example/onnx/super_resolution.py @@ -496,7 +485,7 @@ integrationtest_ubuntu_gpu_python() { integrationtest_ubuntu_gpu_caffe() { set -ex - export PYTHONPATH=/work/deps/caffe/python:./python + export PYTHONPATH=/work/deps/caffe/python:./python python tools/caffe_converter/test_converter.py } @@ -545,7 +534,7 @@ test_ubuntu_cpu_python3() { deploy_docs() { set -ex pushd . - + make docs popd diff --git a/include/mxnet/base.h b/include/mxnet/base.h index c0eb97aa0b3f..783002e6fa48 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -99,15 +99,6 @@ #define MXNET_PREDICT_ONLY 0 #endif -/*! 
- * \brief define operator message for profiler - */ -#if MXNET_USE_PROFILER -#define PROFILER_MESSAGE(msg) msg -#else -#define PROFILER_MESSAGE(msg) nullptr -#endif - /*! \brief major version */ #define MXNET_MAJOR 1 /*! \brief minor version */ @@ -121,7 +112,7 @@ /*! * \brief define function name as profiler message */ -#define PROFILER_MESSAGE_FUNCNAME PROFILER_MESSAGE(__FUNCTION__) +#define PROFILER_MESSAGE_FUNCNAME (__FUNCTION__) /*! \brief namespace of mxnet */ namespace mxnet { diff --git a/make/config.mk b/make/config.mk index fa429f31f298..9eded6f50807 100644 --- a/make/config.mk +++ b/make/config.mk @@ -47,9 +47,6 @@ DEV = 0 # whether compile with debug DEBUG = 0 -# whether compile with profiler -USE_PROFILER = - # whether to turn on segfault signal handler to log the stack trace USE_SIGNAL_HANDLER = diff --git a/src/c_api/c_api_profile.cc b/src/c_api/c_api_profile.cc index 16b47ad20b47..c946e3b6bd50 100644 --- a/src/c_api/c_api_profile.cc +++ b/src/c_api/c_api_profile.cc @@ -37,7 +37,6 @@ namespace mxnet { // #define PROFILE_API_INCLUDE_AS_EVENT -#if MXNET_USE_PROFILER static profiler::ProfileDomain api_domain("MXNET_C_API"); static profiler::ProfileCounter api_call_counter("MXNet C API Calls", &api_domain); static profiler::ProfileCounter api_concurrency_counter("MXNet C API Concurrency", @@ -114,10 +113,8 @@ class ProfilingThreadData { }; static thread_local ProfilingThreadData thread_profiling_data; -#endif // MXNET_USE_PROFILER extern void on_enter_api(const char *function) { -#if MXNET_USE_PROFILER if (profiler::Profiler::Get()->IsProfiling(profiler::Profiler::kAPI)) { if (!thread_profiling_data.ignore_call_) { ++api_call_counter; @@ -136,10 +133,8 @@ extern void on_enter_api(const char *function) { #endif // PROFILE_API_INCLUDE_AS_EVENT } } -#endif // MXNET_USE_PROFILER } extern void on_exit_api() { -#if MXNET_USE_PROFILER if (profiler::Profiler::Get()->IsProfiling(profiler::Profiler::kAPI)) { if (!thread_profiling_data.ignore_call_) { 
CHECK(!thread_profiling_data.calls_.empty()); @@ -152,7 +147,6 @@ extern void on_exit_api() { --api_concurrency_counter; } } -#endif // MXNET_USE_PROFILER } /*! @@ -160,16 +154,12 @@ extern void on_exit_api() { */ struct IgnoreProfileCallScope { IgnoreProfileCallScope() { -#if MXNET_USE_PROFILER DCHECK_EQ(thread_profiling_data.ignore_call_, false); thread_profiling_data.ignore_call_ = true; -#endif // MXNET_USE_PROFILER } ~IgnoreProfileCallScope() { -#if MXNET_USE_PROFILER DCHECK_EQ(thread_profiling_data.ignore_call_, true); thread_profiling_data.ignore_call_ = false; -#endif // MXNET_USE_PROFILER } }; @@ -203,16 +193,6 @@ struct PythonProfileObjects { }; static PythonProfileObjects python_profile_objects; -#if !defined(MXNET_USE_PROFILER) || !MXNET_USE_PROFILER -static void warn_not_built_with_profiler_enabled() { - static volatile bool warned_not_built_with_profiler_enabled = false; - if (!warned_not_built_with_profiler_enabled) { - warned_not_built_with_profiler_enabled = true; - LOG(WARNING) << "Need to compile with USE_PROFILER=1 for MXNet Profiling"; - } -} -#endif // MXNET_USE_PROFILER - struct ProfileConfigParam : public dmlc::Parameter { bool profile_all; bool profile_symbolic; @@ -267,7 +247,6 @@ DMLC_REGISTER_PARAMETER(ProfileMarkerScopeParam); int MXSetProfilerConfig(int num_params, const char* const* keys, const char* const* vals) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); -#if MXNET_USE_PROFILER std::vector> kwargs; kwargs.reserve(num_params); for (int i = 0; i < num_params; ++i) { @@ -287,9 +266,6 @@ int MXSetProfilerConfig(int num_params, const char* const* keys, const char* con param.continuous_dump, param.dump_period, param.aggregate_stats); -#else - warn_not_built_with_profiler_enabled(); -#endif API_END(); } @@ -297,7 +273,6 @@ int MXAggregateProfileStatsPrint(const char **out_str, int reset) { MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); API_BEGIN(); CHECK_NOTNULL(out_str); -#if MXNET_USE_PROFILER profiler::Profiler 
*profiler = profiler::Profiler::Get(); if (profiler->IsEnableOutput()) { // Register stats up until now @@ -309,10 +284,6 @@ int MXAggregateProfileStatsPrint(const char **out_str, int reset) { stats->Dump(os, reset != 0); } ret->ret_str = os.str(); -#else - warn_not_built_with_profiler_enabled(); - ret->ret_str.clear(); -#endif *out_str = (ret->ret_str).c_str(); API_END(); } @@ -320,14 +291,10 @@ int MXAggregateProfileStatsPrint(const char **out_str, int reset) { int MXDumpProfile(int finished) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); -#if MXNET_USE_PROFILER profiler::Profiler *profiler = profiler::Profiler::Get(); CHECK(profiler->IsEnableOutput()) << "Profiler hasn't been run. Config and start profiler first"; profiler->DumpProfile(finished != 0); -#else - warn_not_built_with_profiler_enabled(); -#endif API_END() } @@ -335,7 +302,6 @@ int MXSetProfilerState(int state) { mxnet::IgnoreProfileCallScope ignore; // state, kNotRunning: 0, kRunning: 1 API_BEGIN(); -#if MXNET_USE_PROFILER switch (state) { case profiler::Profiler::kNotRunning: profiler::vtune::vtune_pause(); @@ -345,26 +311,18 @@ int MXSetProfilerState(int state) { break; } profiler::Profiler::Get()->SetState(profiler::Profiler::ProfilerState(state)); -#else - warn_not_built_with_profiler_enabled(); -#endif API_END(); } int MXProfileCreateDomain(const char *domain, ProfileHandle *out) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); -#if MXNET_USE_PROFILER auto dom = std::make_shared(domain); { std::unique_lock lock(python_profile_objects.cs_domains_); python_profile_objects.domains_.push_back(dom); } *out = dom.get(); -#else - warn_not_built_with_profiler_enabled(); - *out = nullptr; -#endif API_END(); } @@ -373,7 +331,6 @@ int MXProfileCreateTask(ProfileHandle domain, ProfileHandle *out) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); -#if MXNET_USE_PROFILER auto ctr = std::make_shared(task_name, static_cast(domain)); @@ -382,10 +339,6 @@ int MXProfileCreateTask(ProfileHandle domain, 
python_profile_objects.tasks_.emplace(std::make_pair(ctr.get(), ctr)); } *out = ctr.get(); -#else - warn_not_built_with_profiler_enabled(); - *out = nullptr; -#endif API_END(); } @@ -394,7 +347,6 @@ int MXProfileCreateFrame(ProfileHandle domain, ProfileHandle *out) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); -#if MXNET_USE_PROFILER auto ctr = std::make_shared(frame_name, static_cast(domain)); @@ -403,17 +355,12 @@ int MXProfileCreateFrame(ProfileHandle domain, python_profile_objects.frames_.emplace(std::make_pair(ctr.get(), ctr)); } *out = ctr.get(); -#else - warn_not_built_with_profiler_enabled(); - *out = nullptr; -#endif API_END(); } int MXProfileCreateEvent(const char *event_name, ProfileHandle *out) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); -#if MXNET_USE_PROFILER auto ctr = std::make_shared(event_name); { @@ -421,17 +368,12 @@ int MXProfileCreateEvent(const char *event_name, ProfileHandle *out) { python_profile_objects.events_.emplace(std::make_pair(ctr.get(), ctr)); } *out = ctr.get(); -#else - warn_not_built_with_profiler_enabled(); - *out = nullptr; -#endif API_END(); } int MXProfileDestroyHandle(ProfileHandle object_handle) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); -#if MXNET_USE_PROFILER CHECK_NE(object_handle, static_cast(nullptr)) << "Invalid NULL handle passed to MXProfileDestroyHandle"; std::shared_ptr shared_object_ptr(nullptr); @@ -484,40 +426,28 @@ int MXProfileDestroyHandle(ProfileHandle object_handle) { } } shared_object_ptr.reset(); // Destroy out of lock scope -#else - warn_not_built_with_profiler_enabled(); -#endif API_END(); } int MXProfileDurationStart(ProfileHandle duration_handle) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); -#if MXNET_USE_PROFILER CHECK_NOTNULL(duration_handle); static_cast(duration_handle)->start(); -#else - warn_not_built_with_profiler_enabled(); -#endif API_END(); } int MXProfileDurationStop(ProfileHandle duration_handle) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); -#if 
MXNET_USE_PROFILER CHECK_NOTNULL(duration_handle); static_cast(duration_handle)->stop(); -#else - warn_not_built_with_profiler_enabled(); -#endif API_END(); } int MXProfilePause(int paused) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); -#if MXNET_USE_PROFILER if (paused) { profiler::vtune::vtune_pause(); profiler::Profiler::Get()->set_paused(true); @@ -525,9 +455,6 @@ int MXProfilePause(int paused) { profiler::Profiler::Get()->set_paused(false); profiler::vtune::vtune_resume(); } -#else - warn_not_built_with_profiler_enabled(); -#endif API_END(); } @@ -536,7 +463,6 @@ int MXProfileCreateCounter(ProfileHandle domain, ProfileHandle *out) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); -#if MXNET_USE_PROFILER auto ctr = std::make_shared(counter_name, static_cast(domain)); @@ -545,32 +471,20 @@ int MXProfileCreateCounter(ProfileHandle domain, python_profile_objects.counters_.emplace(std::make_pair(ctr.get(), ctr)); } *out = ctr.get(); -#else - warn_not_built_with_profiler_enabled(); - *out = nullptr; -#endif API_END(); } int MXProfileSetCounter(ProfileHandle counter_handle, uint64_t value) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); -#if MXNET_USE_PROFILER static_cast(counter_handle)->operator=(value); -#else - warn_not_built_with_profiler_enabled(); -#endif API_END(); } int MXProfileAdjustCounter(ProfileHandle counter_handle, int64_t by_value) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); -#if MXNET_USE_PROFILER static_cast(counter_handle)->operator+=(by_value); -#else - warn_not_built_with_profiler_enabled(); -#endif API_END(); } @@ -579,7 +493,6 @@ int MXProfileSetMarker(ProfileHandle domain, const char *scope) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); -#if MXNET_USE_PROFILER ProfileMarkerScopeParam param; std::vector> kwargs = {{ "scope", scope }}; param.Init(kwargs); @@ -588,8 +501,5 @@ int MXProfileSetMarker(ProfileHandle domain, static_cast( param.scope)); marker.mark(); -#else - warn_not_built_with_profiler_enabled(); 
-#endif API_END(); } diff --git a/src/common/rtc.cc b/src/common/rtc.cc index 444553b128b0..da083c9244ca 100644 --- a/src/common/rtc.cc +++ b/src/common/rtc.cc @@ -178,7 +178,7 @@ void CudaModule::Kernel::Launch( p_args.data(), 0)); CUDA_CALL(cudaStreamSynchronize(s->stream_)); }, ctx, read_vars, write_vars, FnProperty::kNormal, 0, - PROFILER_MESSAGE(mangled_name_.c_str())); + mangled_name_.c_str()); } diff --git a/src/engine/naive_engine.cc b/src/engine/naive_engine.cc index 6246b7316005..1fa530696b36 100644 --- a/src/engine/naive_engine.cc +++ b/src/engine/naive_engine.cc @@ -94,7 +94,6 @@ class NaiveEngine final : public Engine { NaiveOpr *opr = op->Cast(); opr->profiling = profiling && profiler->IsProfiling(profiler::Profiler::kSymbolic); this->PushAsync([&](RunContext ctx, CallbackOnComplete on_complete) { -#if MXNET_USE_PROFILER if (opr->profiling) { std::unique_ptr attrs; if (profiler->AggregateEnabled()) { @@ -107,16 +106,13 @@ class NaiveEngine final : public Engine { if (opr->profiling) { opr->opr_profile->stop(); } -#else - opr->fn(ctx, on_complete); -#endif }, exec_ctx, opr->const_vars, opr->mutable_vars, opr->prop, priority, - PROFILER_MESSAGE(opr->opr_name)); + opr->opr_name); } void PushAsync(AsyncFn exec_fun, @@ -130,7 +126,6 @@ class NaiveEngine final : public Engine { CallbackOnComplete callback = CreateCallback( NaiveEngine::OnComplete, nullptr); this->req_completed_ = false; -#if MXNET_USE_PROFILER profiler::Profiler *profiler = profiler::Profiler::Get(); NaiveOpr *opr = nullptr; const bool profiling = opr_name && profiler->IsProfiling(profiler::Profiler::kImperative); @@ -145,7 +140,6 @@ class NaiveEngine final : public Engine { opr->opr_profile.reset(new profiler::ProfileOperator(opr->opr_name, attrs.release())); opr->opr_profile->start(exec_ctx.dev_type, exec_ctx.dev_id); } -#endif if (exec_ctx.dev_mask() == gpu::kDevMask) { #if MXNET_USE_CUDA size_t dev_id = static_cast(exec_ctx.dev_id); @@ -165,16 +159,14 @@ class NaiveEngine final : public 
Engine { } CHECK(this->req_completed_) << "NaiveEngine only support synchronize Push so far"; -#if MXNET_USE_PROFILER if (profiling) { opr->opr_profile->stop(); } -#endif } void DeleteVariable(SyncFn delete_fn, Context exec_ctx, VarHandle var) override { this->PushSync(delete_fn, exec_ctx, {}, {var}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("DeleteVariable")); + FnProperty::kNormal, 0, "DeleteVariable"); } void WaitForVar(VarHandle var) override { @@ -204,7 +196,6 @@ class NaiveEngine final : public Engine { std::vector*> streams_; }; // class NaiveEngine - Engine *CreateNaiveEngine() { return new NaiveEngine(); } diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc index ca5602bb4823..29100602469d 100644 --- a/src/engine/threaded_engine.cc +++ b/src/engine/threaded_engine.cc @@ -274,7 +274,7 @@ void ThreadedEngine::DeleteOperator(OprHandle op) { ThreadedOpr::Delete(threaded_opr); on_complete(); }, Context::CPU(), {}, deps, FnProperty::kAsync, 0, - PROFILER_MESSAGE("DeleteOperator")); + "DeleteOperator"); } void ThreadedEngine::Push(OprHandle op, Context exec_ctx, int priority, bool profiling) { @@ -312,11 +312,7 @@ void ThreadedEngine::PushAsync(AsyncFn fn, Context exec_ctx, BulkFlush(); ThreadedOpr *opr = NewOperator(std::move(fn), const_vars, mutable_vars, prop, opr_name, wait); opr->temporary = true; -#if MXNET_USE_PROFILER const bool profiling = profiler_->IsProfiling(profiler::Profiler::kImperative); -#else - const bool profiling = false; -#endif Push(opr, exec_ctx, priority, profiling); } @@ -350,7 +346,7 @@ void ThreadedEngine::DeleteVariable(SyncFn delete_fn, delete_fn(ctx); on_complete(); }, exec_ctx, {}, {var}, FnProperty::kDeleteVar, 0, - PROFILER_MESSAGE("DeleteVariable")); + "DeleteVariable"); } void ThreadedEngine::WaitForVar(VarHandle var) { @@ -379,7 +375,7 @@ void ThreadedEngine::WaitForVar(VarHandle var) { } on_complete(); }, Context::CPU(), {var}, {}, FnProperty::kNormal, 0, - PROFILER_MESSAGE("WaitForVar"), true); + 
"WaitForVar", true); { std::unique_lock lock{finished_m_}; finished_cv_.wait(lock, [this, &done]() { @@ -463,12 +459,10 @@ void ThreadedEngine::OnCompleteStatic( Engine *engine, void *opr_block_) { OprBlock *opr_block = static_cast(opr_block_); ThreadedOpr *threaded_opr = opr_block->opr; -#if MXNET_USE_PROFILER if (opr_block->profiling && threaded_opr->opr_name) { // record operator end timestamp opr_block->opr_profile->stop(); } -#endif static_cast(engine)->OnComplete(threaded_opr); OprBlock::Delete(opr_block); } diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h index 673fa3bf9ebb..bfb1b1d932ca 100644 --- a/src/engine/threaded_engine.h +++ b/src/engine/threaded_engine.h @@ -305,10 +305,8 @@ class ThreadedEngine : public Engine { objpool_varblk_ref_ = common::ObjectPool::_GetSharedRef(); objpool_var_ref_ = common::ObjectPool::_GetSharedRef(); -#ifdef MXNET_USE_PROFILER // Get a ref to the profiler so that it doesn't get killed before us profiler::Profiler::Get(&profiler_); -#endif // MXNET_USE_PROFILER } ~ThreadedEngine() { { @@ -336,7 +334,6 @@ class ThreadedEngine : public Engine { */ void ExecuteOprBlock(RunContext run_ctx, OprBlock* opr_block) { ThreadedOpr* threaded_opr = opr_block->opr; -#if MXNET_USE_PROFILER if (opr_block->profiling && threaded_opr->opr_name) { std::unique_ptr attrs; if (profiler_->AggregateEnabled()) { @@ -347,7 +344,6 @@ class ThreadedEngine : public Engine { attrs.release())); opr_block->opr_profile->start(ctx.dev_type, ctx.dev_id); } -#endif CallbackOnComplete callback = this->CreateCallback(ThreadedEngine::OnCompleteStatic, opr_block); const bool debug_info = (engine_info_ && debug_push_opr_ == opr_block); @@ -539,10 +535,8 @@ class ThreadedEngine : public Engine { std::shared_ptr > objpool_varblk_ref_; std::shared_ptr > objpool_var_ref_; -#if MXNET_USE_PROFILER /*! \brief Hold a ref count ot the profiler */ std::shared_ptr profiler_; -#endif // MXNET_USE_PROFILER /*! 
* \brief Disallow copy construction and assignment. diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 32772f2108d1..9108bae17323 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1235,11 +1235,7 @@ void GraphExecutor::InitCachedOps() { for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { const auto& inode = idx[nid]; if (inode.source->is_variable()) continue; -#if MXNET_USE_PROFILER op_nodes_[nid].opr_name = inode.source->op()->name.c_str(); -#else - op_nodes_[nid].opr_name = nullptr; -#endif if (skip_plus_node.at(nid)) { op_nodes_[nid].skip_exec_node = true; continue; } @@ -1309,7 +1305,7 @@ void GraphExecutor::InitCachedOps() { exec->Setup(); on_complete(); }, Context::CPU(), {}, all_vars, FnProperty::kNormal, 0, - PROFILER_MESSAGE("SetupExec")); + "SetupExec"); auto exec_fun = [exec, is_async, is_gpu] ( RunContext ctx, Engine::CallbackOnComplete on_complete) { if (is_async) { @@ -1332,7 +1328,7 @@ void GraphExecutor::InitCachedOps() { // setup the vars op_nodes_[nid].cached_opr = Engine::Get()->NewOperator( exec_fun, use_vars, mutate_vars, FnProperty::kNormal, - PROFILER_MESSAGE(op_nodes_[nid].opr_name)); + op_nodes_[nid].opr_name); op_nodes_[nid].mutate_vars = mutate_vars; op_nodes_[nid].use_vars = use_vars; } @@ -1484,11 +1480,7 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { auto seg_op = cached_seg_opr_[nid]; // Check segments first if (monitor_callback_ == nullptr && seg_op.opr != nullptr && seg_op.topo_end <= topo_end) { -#if MXNET_USE_PROFILER bool profiling = profiler::Profiler::Get()->GetState() == profiler::Profiler::kRunning; -#else - bool profiling = false; -#endif Engine::Get()->Push(seg_op.opr, seg_op.ctx, 0, profiling); nid = seg_op.topo_end - 1; continue; @@ -1505,11 +1497,7 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { CHECK_EQ(opnode.exec->out_array.size(), 1U); CopyFromTo(opnode.exec->in_array[0], 
&(opnode.exec->out_array[0])); } else if (opnode.cached_opr != nullptr) { -#if MXNET_USE_PROFILER bool profiling = profiler::Profiler::Get()->GetState() == profiler::Profiler::kRunning; -#else - bool profiling = false; -#endif Engine::Get()->Push(opnode.cached_opr, opnode.ctx, 0, profiling); } else { LOG(FATAL) << "Not accessed"; @@ -1533,11 +1521,7 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start, if (topo_end <= topo_start) { return ret; } -#if MXNET_USE_PROFILER std::string opr_names = "["; -#else - std::string opr_names = "Bulk Execution"; -#endif const auto& idx = graph_.indexed_graph(); for (size_t nid = topo_start; nid < topo_end; ++nid) { @@ -1559,9 +1543,7 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start, std::copy(op_node.use_vars.begin(), op_node.use_vars.end(), std::inserter(use_vars, use_vars.end())); ret.exec_list.push_back(exec); -#if MXNET_USE_PROFILER opr_names += inode.source->op()->name + ","; -#endif } if (pctx == nullptr) return ret; @@ -1585,17 +1567,12 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start, } on_complete(); }; -#if MXNET_USE_PROFILER - opr_names.pop_back(); - opr_names += "]"; - // the lifetime of `opr_names.c_str()` is same with opr_names - // you need to copy it out. 
(potential memory leak risk) - char *p_opr_name = new char[opr_names.size() + 1]; - memcpy(p_opr_name, opr_names.c_str(), opr_names.size() + 1); -#endif + opr_names.pop_back(); + opr_names += "]"; + auto iter = cached_seg_opr_names_.insert(opr_names).first; ret.opr = Engine::Get()->NewOperator( - exec_fun, use_vars, mutate_vars, FnProperty::kNormal, - PROFILER_MESSAGE(p_opr_name)); + exec_fun, use_vars, mutate_vars, FnProperty::kNormal, + iter->c_str()); return ret; } } // namespace exec diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h index ee32db72cebd..3f1ebe568f80 100644 --- a/src/executor/graph_executor.h +++ b/src/executor/graph_executor.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -241,6 +242,8 @@ class GraphExecutor : public Executor { bool prefer_bulk_execution_; // cached segment operator std::vector cached_seg_opr_; + // cached segment operator name (needs a longer lifecycle than cached_seg_opr_) + std::unordered_set cached_seg_opr_names_; // verbose logging bool log_verbose_ = false; }; diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 044ab823f770..e7f00b52d659 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -381,7 +381,7 @@ inline void PushFCompute(const FCompute& fn, rctx.get_stream()->Wait(); } }, ctx, read_vars, write_vars, FnProperty::kNormal, - 0, PROFILER_MESSAGE(op->name.c_str())); + 0, op->name.c_str()); } inline void PushFComputeEx(const FComputeEx& fn, @@ -413,7 +413,7 @@ inline void PushFComputeEx(const FComputeEx& fn, } else { CHECK(exec_type == ExecType::kSync); Engine::Get()->PushSync(run, ctx, read_vars, write_vars, FnProperty::kNormal, - 0, PROFILER_MESSAGE(op->name.c_str())); + 0, op->name.c_str()); } } @@ -449,7 +449,7 @@ inline void PushOperator(const OpStatePtr& state, rctx.get_stream()->Wait(); } }, ctx, read_vars, write_vars, FnProperty::kNormal, - 0, 
PROFILER_MESSAGE(op->name.c_str())); + 0, op->name.c_str()); } else { CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " @@ -484,12 +484,12 @@ inline void PushOperator(const OpStatePtr& state, [=](RunContext rctx) { run(rctx, engine::CallbackOnComplete()); }, ctx, read_vars, write_vars, FnProperty::kNormal, - 0, PROFILER_MESSAGE(op->name.c_str())); + 0, op->name.c_str()); } else { CHECK(exec_type == ExecType::kAsync); Engine::Get()->PushAsync( run, ctx, read_vars, write_vars, FnProperty::kAsync, - 0, PROFILER_MESSAGE(op->name.c_str())); + 0, op->name.c_str()); } } } diff --git a/src/io/image_io.cc b/src/io/image_io.cc index f6183a12c2dc..a996a2208d79 100644 --- a/src/io/image_io.cc +++ b/src/io/image_io.cc @@ -206,7 +206,7 @@ void Imdecode(const nnvm::NodeAttrs& attrs, ImdecodeImpl(param.flag, param.to_rgb, str_img, len, const_cast(&ndout)); }, ndout.ctx(), {ndin.var()}, {ndout.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("Imdecode")); + FnProperty::kNormal, 0, "Imdecode"); #else LOG(FATAL) << "Build with USE_OPENCV=1 for image io."; #endif // MXNET_USE_OPENCV @@ -245,7 +245,7 @@ void Imread(const nnvm::NodeAttrs& attrs, ImdecodeImpl(param.flag, param.to_rgb, buff.get(), fsize, const_cast(&ndout)); }, ndout.ctx(), {}, {ndout.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("Imread")); + FnProperty::kNormal, 0, "Imread"); #else LOG(FATAL) << "Build with USE_OPENCV=1 for image io."; #endif // MXNET_USE_OPENCV diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 3085966e99b0..96248998902c 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -158,7 +158,7 @@ class CommCPU : public Comm { ReduceSumCPU(reduce); on_complete(); }, Context::CPU(), const_vars, {reduce[0].var()}, - FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreReduce")); + FnProperty::kCPUPrioritized, priority, "KVStoreReduce"); } else { // buf.merged is a sparse ndarray. 
@@ -188,7 +188,7 @@ class CommCPU : public Comm { : mxnet::ndarray::ElementwiseSum(rctx.get_stream(), rsc, reduce, &out); on_complete(); }, Context::CPU(), const_vars, {result.var(), rsc.var}, - FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreReduce")); + FnProperty::kCPUPrioritized, priority, "KVStoreReduce"); } return buf.merged; @@ -235,7 +235,7 @@ class CommCPU : public Comm { &temp); on_complete(); }, Context::CPU(), {src.var(), row_id.var()}, {retained_cpu.var()}, - FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreSparseRetain")); + FnProperty::kNormal, priority, "KVStoreSparseRetain"); // if retained_cpu == out, CopyFromTo will ignore the copy operation CopyFromTo(retained_cpu, out, priority); } @@ -592,7 +592,7 @@ class CommDevice : public Comm { } on_complete(); }, out_gpu.ctx(), {src.var(), row_id.var()}, {out_gpu.var()}, - FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreSparseRetain")); + FnProperty::kNormal, priority, "KVStoreSparseRetain"); CopyFromTo(out_gpu, out, priority); } } diff --git a/src/kvstore/gradient_compression.cc b/src/kvstore/gradient_compression.cc index b8c626cd53a8..e94a0570d1f4 100644 --- a/src/kvstore/gradient_compression.cc +++ b/src/kvstore/gradient_compression.cc @@ -129,7 +129,7 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t std::vector inputs = {from.data(), residual->data(), to->data()}; Quantize2BitImpl(ctx.get_stream(), inputs, threshold); }, from.ctx(), {from.var()}, {to->var(), residual->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); + mxnet::FnProperty::kNormal, priority, "QuantizeCPU"); } else { #if MXNET_USE_CUDA if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { @@ -139,7 +139,7 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t // Wait GPU kernel to complete ctx.get_stream()->Wait(); }, from.ctx(), {from.var()}, {to->var(), residual->var()}, - 
mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); + mxnet::FnProperty::kNormal, priority, "QuantizeGPU"); } else { LOG(FATAL) << "unknown device mask"; } @@ -165,7 +165,7 @@ void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray std::vector inputs = {from.data(), to->data()}; Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); }, from.ctx(), {from.var()}, {to->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); + mxnet::FnProperty::kNormal, priority, "DequantizeCPU"); } else { #if MXNET_USE_CUDA if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { @@ -175,7 +175,7 @@ void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray // Wait GPU kernel to complete ctx.get_stream()->Wait(); }, from.ctx(), {from.var()}, {to->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); + mxnet::FnProperty::kNormal, priority, "DequantizeGPU"); } else { LOG(FATAL) << "unknown device mask"; } diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 7ab5783f7fce..afba9ac5f274 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -250,7 +250,7 @@ class KVStoreDist : public KVStoreLocal { {recv_buf.var()}, FnProperty::kNormal, priority, - PROFILER_MESSAGE("KVStoreDistDefaultStoragePull")); + "KVStoreDistDefaultStoragePull"); comm_->Broadcast(key, recv_buf, grouped_vals[i], priority); } @@ -392,7 +392,7 @@ class KVStoreDist : public KVStoreLocal { {}, FnProperty::kNormal, priority, - PROFILER_MESSAGE("KVStoreDistCompressedPush")); + "KVStoreDistCompressedPush"); } void PushDefault(int key, const NDArray &send_buf, const PSKV& pskv, int priority) { @@ -414,7 +414,7 @@ class KVStoreDist : public KVStoreLocal { {}, FnProperty::kNormal, priority, - PROFILER_MESSAGE("KVStoreDistDefaultPush")); + "KVStoreDistDefaultPush"); } // push row sparse gradient @@ -447,7 +447,7 @@ class KVStoreDist : public KVStoreLocal 
{ {}, FnProperty::kNormal, priority, - PROFILER_MESSAGE("KVStoreDistRowSparsePush")); + "KVStoreDistRowSparsePush"); } @@ -490,7 +490,7 @@ class KVStoreDist : public KVStoreLocal { {recv_buf.var()}, FnProperty::kNormal, priority, - PROFILER_MESSAGE("KVStoreDistRowSparsePull")); + "KVStoreDistRowSparsePull"); } /** diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 69fb37ec5f39..3383c97f9262 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -404,7 +404,7 @@ class KVStoreLocal : public KVStore { } on_complete(); }, out.ctx(), {data_in_ctx.var()}, mutate_vars, - FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreUnique")); + FnProperty::kNormal, priority, "KVStoreUnique"); return out; } diff --git a/src/kvstore/kvstore_nccl.h b/src/kvstore/kvstore_nccl.h index e97a6d7f7e55..95ee8147a153 100644 --- a/src/kvstore/kvstore_nccl.h +++ b/src/kvstore/kvstore_nccl.h @@ -309,7 +309,7 @@ class KVStoreNCCL : public KVStoreLocal { mutate_vars, FnProperty::kCPUPrioritized, priority, - PROFILER_MESSAGE("KVStoreReduce")); + "KVStoreReduce"); } virtual void Broadcast(const std::vector keys, @@ -413,7 +413,7 @@ class KVStoreNCCL : public KVStoreLocal { mutable_vars, FnProperty::kCPUPrioritized, priority, - PROFILER_MESSAGE("KVStoreBCast")); + "KVStoreBCast"); } // Function that waits for NCCL collective to complete @@ -434,7 +434,7 @@ class KVStoreNCCL : public KVStoreLocal { mutate_vars, FnProperty::kCPUPrioritized, priority, - PROFILER_MESSAGE("KVStoreStreamSync")); + "KVStoreStreamSync"); } // Initialize single key diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 52b96fad6929..7debfea14eea 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -562,7 +562,7 @@ void NDArray::Reorder2DefaultAsync() { tmp.ptr_->Reorder2Default(); on_complete(); }, ctx(), const_vars, mutable_vars, - FnProperty::kNormal, 0, PROFILER_MESSAGE("Reorder2Default")); + FnProperty::kNormal, 0, "Reorder2Default"); } void 
NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::primitive_desc &desc) { @@ -574,7 +574,7 @@ void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::primitive_desc &desc) tmp.ptr_->MKLDNNDataReorder(desc); on_complete(); }, ctx(), const_vars, mutable_vars, - FnProperty::kNormal, 0, PROFILER_MESSAGE("Reorder")); + FnProperty::kNormal, 0, "Reorder"); } const mkldnn::memory *NDArray::GetMKLDNNData() const { @@ -1236,7 +1236,7 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) { CopyFromToImpl(from, to, ctx, requested); on_complete(); }, from.ctx(), const_vars, mutable_vars, - FnProperty::kNormal, priority, PROFILER_MESSAGE("CopyCPU2CPU")); + FnProperty::kNormal, priority, "CopyCPU2CPU"); } else { #if MXNET_USE_CUDA if (a == cpu::kDevMask && b == gpu::kDevMask) { @@ -1246,7 +1246,7 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) { ctx.get_stream()->Wait(); on_complete(); }, to.ctx(), const_vars, mutable_vars, - FnProperty::kCopyToGPU, priority, PROFILER_MESSAGE("CopyCPU2GPU")); + FnProperty::kCopyToGPU, priority, "CopyCPU2GPU"); } else if (a == gpu::kDevMask && b == cpu::kDevMask) { Engine::Get()->PushAsync( [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) { @@ -1254,7 +1254,7 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) { ctx.get_stream()->Wait(); on_complete(); }, from.ctx(), const_vars, mutable_vars, - FnProperty::kCopyFromGPU, priority, PROFILER_MESSAGE("CopyGPU2CPU")); + FnProperty::kCopyFromGPU, priority, "CopyGPU2CPU"); } else if (a == gpu::kDevMask && b == gpu::kDevMask) { Engine::Get()->PushAsync( [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) { @@ -1263,7 +1263,7 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) { on_complete(); }, from.ctx(), const_vars, mutable_vars, from.dtype() != to.dtype() ? 
FnProperty::kNormal : FnProperty::kCopyFromGPU, - priority, PROFILER_MESSAGE("CopyGPU2GPU")); + priority, "CopyGPU2GPU"); } else { LOG(FATAL) << "unknown device mask"; } @@ -1326,7 +1326,7 @@ void ElementwiseSum(const std::vector &source, NDArray *out, int priori // Wait GPU kernel to complete ctx.get_stream()->Wait(); }, out->ctx(), const_vars, {ret.var()}, - FnProperty::kNormal, priority, PROFILER_MESSAGE("DenseElementwiseSum")); + FnProperty::kNormal, priority, "DenseElementwiseSum"); break; } #endif @@ -1355,7 +1355,7 @@ void ElementwiseSum(const std::vector &source, NDArray *out, int priori default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; } }, ret.ctx(), const_vars, {ret.var(), rsc.var}, - FnProperty::kNormal, priority, PROFILER_MESSAGE("RowSparseElementwiseSum")); + FnProperty::kNormal, priority, "RowSparseElementwiseSum"); } else { LOG(FATAL) << "Not implemented for storage_type " << common::stype_string(stype); } @@ -1831,7 +1831,7 @@ void NDArray::SyncCopyFromCPU(const void *data, size_t size) const { rctx.get_stream()->Wait(); on_complete(); }, this->ctx(), {}, {this->var()}, - FnProperty::kCopyToGPU, 0, PROFILER_MESSAGE("SyncCopyCPU2GPU")); + FnProperty::kCopyToGPU, 0, "SyncCopyCPU2GPU"); this->WaitToRead(); #else LOG(FATAL) << "GPU is not enabled"; @@ -1888,7 +1888,7 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) { TBlob dst_data = get_dst_data(src_data.shape_); ndarray::Copy(src_data, &dst_data, src.ctx(), this->ctx(), rctx); }, this->ctx(), const_vars, {this->var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("SyncCopyFromNDArrayCPU2CPU")); + FnProperty::kNormal, 0, "SyncCopyFromNDArrayCPU2CPU"); } else { #if MXNET_USE_CUDA if (src_dev_mask == cpu::kDevMask && dst_dev_mask == gpu::kDevMask) { @@ -1900,7 +1900,7 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) { rctx.get_stream()->Wait(); on_complete(); }, this->ctx(), const_vars, {this->var()}, - FnProperty::kCopyToGPU, 0, 
PROFILER_MESSAGE("SyncCopyFromNDArrayCPU2GPU")); + FnProperty::kCopyToGPU, 0, "SyncCopyFromNDArrayCPU2GPU"); } else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == cpu::kDevMask) { Engine::Get()->PushAsync( [&](RunContext rctx, Engine::CallbackOnComplete on_complete) { @@ -1910,7 +1910,7 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) { rctx.get_stream()->Wait(); on_complete(); }, this->ctx(), const_vars, {this->var()}, - FnProperty::kCopyFromGPU, 0, PROFILER_MESSAGE("SyncCopyFromNDArrayGPU2CPU")); + FnProperty::kCopyFromGPU, 0, "SyncCopyFromNDArrayGPU2CPU"); } else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == gpu::kDevMask) { Engine::Get()->PushAsync( [&](RunContext rctx, Engine::CallbackOnComplete on_complete) { @@ -1921,7 +1921,7 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) { on_complete(); }, this->ctx(), const_vars, {this->var()}, src.dtype() != this->dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU, - 0, PROFILER_MESSAGE("SyncCopyFromNDArrayGPU2GPU")); + 0, "SyncCopyFromNDArrayGPU2GPU"); } else { LOG(FATAL) << "unknown device mask"; } @@ -1966,7 +1966,7 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const { rctx.get_stream()->Wait(); on_complete(); }, this->ctx(), {this->var()}, {}, - FnProperty::kCopyFromGPU, 0, PROFILER_MESSAGE("SyncCopyGPU2CPU")); + FnProperty::kCopyFromGPU, 0, "SyncCopyGPU2CPU"); this->WaitToWrite(); #else LOG(FATAL) << "GPU is not enabled"; @@ -1981,14 +1981,14 @@ void NDArray::SyncCheckFormat(const bool full_check) const { Engine::Get()->PushSync([&](RunContext rctx) { common::CheckFormatWrapper(rctx, *this, err_cpu, full_check); }, this->ctx(), {this->var()}, {}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("CheckFormat")); + FnProperty::kNormal, 0, "CheckFormat"); } else { #if MXNET_USE_CUDA Engine::Get()->PushSync([&](RunContext rctx) { common::CheckFormatWrapper(rctx, *this, err_cpu, full_check); rctx.get_stream()->Wait(); }, this->ctx(), {this->var()}, {}, 
- FnProperty::kNormal, 0, PROFILER_MESSAGE("CheckFormat")); + FnProperty::kNormal, 0, "CheckFormat"); #else LOG(FATAL) << "GPU is not enabled"; #endif diff --git a/src/operator/custom/custom-inl.h b/src/operator/custom/custom-inl.h index 38aeefd66a4e..13c8d3434b18 100644 --- a/src/operator/custom/custom-inl.h +++ b/src/operator/custom/custom-inl.h @@ -91,7 +91,7 @@ class CustomOperator { Engine::Get()->PushSync([=](RunContext rctx) { ctx.async_on_complete(); }, ctx.run_ctx.ctx, vars, {}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("CustomOperator")); + FnProperty::kNormal, 0, "CustomOperator"); }); cv_.notify_all(); } diff --git a/src/operator/custom/ndarray_op.cc b/src/operator/custom/ndarray_op.cc index 9ad0d09e3b0d..396c43555dc8 100644 --- a/src/operator/custom/ndarray_op.cc +++ b/src/operator/custom/ndarray_op.cc @@ -89,7 +89,7 @@ void NDArrayOp::Forward(const OpContext &ctx, [ndcpy, ctx](RunContext rctx, Engine::CallbackOnComplete on_complete) { ctx.async_on_complete(); on_complete(); - }, ndctx, ndvar, {}, FnProperty::kNormal, 0, PROFILER_MESSAGE("NDArrayOpForward")); + }, ndctx, ndvar, {}, FnProperty::kNormal, 0, "NDArrayOpForward"); } template @@ -138,7 +138,7 @@ void NDArrayOp::Backward(const OpContext &ctx, [ndcpy, ctx](RunContext rctx, Engine::CallbackOnComplete on_complete){ ctx.async_on_complete(); on_complete(); - }, ndctx, ndvar, {}, FnProperty::kNormal, 0, PROFILER_MESSAGE("NDArrayOpBackward")); + }, ndctx, ndvar, {}, FnProperty::kNormal, 0, "NDArrayOpBackward"); } Operator* NDArrayOpProp::CreateOperator(Context ctx) const { diff --git a/src/operator/operator_util.cc b/src/operator/operator_util.cc index bae3cb6a2964..326a1ca38ba3 100644 --- a/src/operator/operator_util.cc +++ b/src/operator/operator_util.cc @@ -506,7 +506,7 @@ void SimpleOpRegEntryImpl::RegisterSourceImperative() { } #endif }, ret.ctx(), {}, write_vars, - FnProperty::kNormal, 0, PROFILER_MESSAGE("RegisterSourceImperative")); + FnProperty::kNormal, 0, "RegisterSourceImperative"); }; 
// register the function. NDArrayReg() @@ -690,7 +690,7 @@ void SimpleOpRegEntryImpl::RegisterUnaryImperative() { } #endif }, src.ctx(), const_vars, write_vars, - FnProperty::kNormal, 0, PROFILER_MESSAGE("RegisterUnaryImperative")); + FnProperty::kNormal, 0, "RegisterUnaryImperative"); }; // register the function. NDArrayReg() @@ -964,7 +964,7 @@ void SimpleOpRegEntryImpl::RegisterBinaryImperative() { } #endif }, lhs.ctx(), const_vars, write_vars, - FnProperty::kNormal, 0, PROFILER_MESSAGE("RegisterBinaryImperative")); + FnProperty::kNormal, 0, "RegisterBinaryImperative"); }; // register the function. NDArrayReg() diff --git a/src/optimizer/sgd-inl.h b/src/optimizer/sgd-inl.h index 3c0224d28070..12738f8e4053 100644 --- a/src/optimizer/sgd-inl.h +++ b/src/optimizer/sgd-inl.h @@ -142,12 +142,12 @@ class SGDOpt : public Optimizer { Engine::Get()->PushSync([this, index, w, g, lr, wd](RunContext ctx) { call_sgd_mom_update_cpu(ctx, w.data(), g.data(), mom[index].data(), lr, wd, param_); }, w.ctx(), {g.var()}, {w.var(), mom[index].var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("SGDOptUpdate")); + FnProperty::kNormal, 0, "SGDOptUpdate"); } else { Engine::Get()->PushSync([this, index, w, g, lr, wd](RunContext ctx) { call_sgd_update_cpu(ctx, w.data(), g.data(), lr, wd, param_); }, w.ctx(), {g.var()}, {w.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("SGDOptUpdate")); + FnProperty::kNormal, 0, "SGDOptUpdate"); } break; case Context::kGPU: @@ -156,12 +156,12 @@ class SGDOpt : public Optimizer { Engine::Get()->PushSync([this, index, w, g, lr, wd](RunContext ctx) { call_sgd_mom_update_gpu(ctx, w.data(), g.data(), mom[index].data(), lr, wd, param_); }, w.ctx(), {g.var()}, {w.var(), mom[index].var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("SGDOptUpdate")); + FnProperty::kNormal, 0, "SGDOptUpdate"); } else { Engine::Get()->PushSync([this, index, w, g, lr, wd](RunContext ctx) { call_sgd_update_gpu(ctx, w.data(), g.data(), lr, wd, param_); }, w.ctx(), {g.var()}, 
{w.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("SGDOptUpdate")); + FnProperty::kNormal, 0, "SGDOptUpdate"); } break; #else diff --git a/src/profiler/profiler.cc b/src/profiler/profiler.cc index e6dafb3d0916..f2d14cf2729a 100644 --- a/src/profiler/profiler.cc +++ b/src/profiler/profiler.cc @@ -98,7 +98,6 @@ Profiler::~Profiler() { } Profiler* Profiler::Get(std::shared_ptr *sp) { -#if MXNET_USE_PROFILER static std::mutex mtx; static std::shared_ptr prof = nullptr; if (!prof) { @@ -111,9 +110,6 @@ Profiler* Profiler::Get(std::shared_ptr *sp) { *sp = prof; } return prof.get(); -#else - return nullptr; -#endif } void Profiler::SetState(ProfilerState state) { diff --git a/src/profiler/storage_profiler.h b/src/profiler/storage_profiler.h index b9a7e01e8463..bcbe7e7e3ffd 100644 --- a/src/profiler/storage_profiler.h +++ b/src/profiler/storage_profiler.h @@ -27,7 +27,6 @@ namespace mxnet { namespace storage { -#if MXNET_USE_PROFILER /*! * \brief Storage allocation/deallocation profiling via ProfileCounters */ @@ -103,8 +102,6 @@ class DeviceStorageProfiler { std::vector> mem_counters_; }; -#endif // MXNET_USE_PROFILER - } // namespace storage } // namespace mxnet diff --git a/src/resource.cc b/src/resource.cc index c2b260985a5f..18927f0cd337 100644 --- a/src/resource.cc +++ b/src/resource.cc @@ -211,7 +211,7 @@ class ResourceManagerImpl : public ResourceManager { r->Seed(seed); on_complete(); }, ctx, {}, {resource.var}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("ResourceRandomSetSeed")); + FnProperty::kNormal, 0, "ResourceRandomSetSeed"); } }; @@ -284,7 +284,7 @@ class ResourceManagerImpl : public ResourceManager { common::random::RandGenerator::AllocState(r); r->Seed(rctx.get_stream(), seed); }, ctx, {}, {resource[i].var}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("ResourceParallelRandomSetSeed")); + FnProperty::kNormal, 0, "ResourceParallelRandomSetSeed"); sampler[i] = r; resource[i].ptr_ = sampler[i]; resource[i].req = 
ResourceRequest(ResourceRequest::kParallelRandom); @@ -310,7 +310,7 @@ class ResourceManagerImpl : public ResourceManager { r->Seed(rctx.get_stream(), seed); on_complete(); }, ctx, {}, {resource[i].var}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("ResourceNativeRandomSetSeed")); + FnProperty::kNormal, 0, "ResourceNativeRandomSetSeed"); } // reset pointer to ensure the same result with the same seed. curr_ptr.store(0); diff --git a/src/storage/storage.cc b/src/storage/storage.cc index a4bccec0a048..44ae36594703 100644 --- a/src/storage/storage.cc +++ b/src/storage/storage.cc @@ -74,9 +74,7 @@ class StorageImpl : public Storage { // internal storage managers std::array, kMaxNumberOfDevices> storage_managers_; -#if MXNET_USE_PROFILER storage::DeviceStorageProfiler profiler_; -#endif // MXNET_USE_PROFILER }; // struct Storage::Impl #if MXNET_USE_CUDA int StorageImpl::num_gpu_device = 0; @@ -133,9 +131,7 @@ void StorageImpl::Alloc(Storage::Handle* handle) { this->ActivateDevice(handle->ctx); manager->Alloc(handle); -#if MXNET_USE_PROFILER profiler_.OnAlloc(*handle); -#endif // MXNET_USE_PROFILER } void StorageImpl::Free(Storage::Handle handle) { @@ -148,9 +144,7 @@ void StorageImpl::Free(Storage::Handle handle) { }); this->ActivateDevice(ctx); manager->Free(handle); -#if MXNET_USE_PROFILER profiler_.OnFree(handle); -#endif // MXNET_USE_PROFILER } void StorageImpl::DirectFree(Storage::Handle handle) { @@ -163,9 +157,7 @@ void StorageImpl::DirectFree(Storage::Handle handle) { }); this->ActivateDevice(ctx); manager->DirectFree(handle); -#if MXNET_USE_PROFILER profiler_.OnFree(handle); -#endif // MXNET_USE_PROFILER } void StorageImpl::SharedIncrementRefCount(Storage::Handle handle) {