From 2ac53b106751629c41dc0bb6844db9d6c3685783 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Thu, 1 Apr 2021 09:34:33 -0700
Subject: [PATCH] [Profiling,VM] Profiling interface for VM and Graph Runtime
 (#7624)

* [Profiling,VM] Profiling interface for VM and Graph Runtime

* lint

* fix test

* make profiling test actually run

* Try to better match the graph runtime function names to vm

* formatting

* DurationNode.value -> microseconds; PercentNode.value -> percent; make
  frame sorting optional.

* renaming for the tvmcontext -> device change

* formatting

* remove old vm profiler get_stat api

* fix tests
---
 include/tvm/runtime/profiling.h               | 141 +++++++++
 python/tvm/contrib/debugger/debug_executor.py |  19 ++
 python/tvm/runtime/profiler_vm.py             |  36 ++-
 .../debug/graph_executor_debug.cc             |  46 +++
 src/runtime/graph_executor/graph_executor.cc  |   2 +
 src/runtime/profiling.cc                      | 271 ++++++++++++++++++
 src/runtime/vm/profiler/vm.cc                 | 112 +++-----
 src/runtime/vm/profiler/vm.h                  |   3 +-
 tests/cpp/{profiling.cc => profiling_test.cc} |   0
 .../python/unittest/test_runtime_profiling.py |  51 ++++
 .../unittest/test_runtime_vm_profiler.py      |  22 +-
 11 files changed, 612 insertions(+), 91 deletions(-)
 rename tests/cpp/{profiling.cc => profiling_test.cc} (100%)
 create mode 100644 tests/python/unittest/test_runtime_profiling.py

diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index fa16e7820b352..912ca26ec0f1e 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -27,9 +27,14 @@
 #include
 #include
 #include
+#include
 #include
+#include
 #include
+#include
+#include
+#include

 namespace tvm {
 namespace runtime {
@@ -144,6 +149,142 @@ class Timer : public ObjectRef {
  */
 Timer DefaultTimer(Device dev);

+namespace profiling {
+
+/*! Information about a single function or operator call. */
+struct CallFrame {
+  /*! Device on which the call was made */
+  Device dev;
+  /*! Name of the function or op */
+  String name;
+  /*! Runtime of the function or op */
+  Timer timer;
+  /*! Extra performance metrics */
+  std::unordered_map<std::string, ObjectRef> extra_metrics;
+};
+
+/*! Runtime profiler for function and/or operator calls. Used in the graph
+ * runtime and VM to provide profiling information for all operators.
+ *
+ * Example usage:
+ * \code{.cpp}
+ * Profiler prof;
+ * Device cpu, gpu;
+ * prof.Start({cpu, gpu});
+ * prof.StartCall("my_gpu_kernel", gpu);
+ * my_gpu_kernel();
+ * prof.StopCall();
+ * prof.StartCall("my_cpu_function", cpu);
+ * my_cpu_function();
+ * prof.StopCall();
+ * prof.Stop();
+ * std::cout << prof.Report() << std::endl;  // print profiling report
+ * \endcode
+ */
+class Profiler {
+ public:
+  /*! \brief Start the profiler.
+   * \param devs The list of devices the profiler will be running on. Should
+   * include all devices used by profiled operators.
+   *
+   * This function should only be called once per object.
+   */
+  void Start(const std::vector<Device>& devs);
+  /*! \brief Stop the profiler.
+   *
+   * This function should only be called once per object, after `Start` has been called.
+   */
+  void Stop();
+  /*! \brief Start a function call.
+   * \param name The name of the function being called.
+   * \param dev The device on which the function is running.
+   * \param extra_metrics Optional additional profiling information to add to
+   * the frame (input sizes, allocations).
+   *
+   * `StartCall` may be nested, but each `StartCall` needs a matching
+   * `StopCall`. Function calls are stopped in LIFO order, so calls to
+   * `StartCall` and `StopCall` must be nested properly.
+   */
+  void StartCall(String name, Device dev,
+                 std::unordered_map<std::string, ObjectRef> extra_metrics = {});
+  /*! \brief Stop the last `StartCall`.
+   * \param extra_metrics Optional additional profiling information to add to
+   * the frame (input sizes, allocations).
+   */
+  void StopCall(std::unordered_map<std::string, ObjectRef> extra_metrics = {});
+  /*! \brief A textual report of total runtime between `Start` and `Stop` as
+   * well as individual statistics for each `StartCall`-`StopCall` pair.
+   * \param aggregate Whether or not to join multiple calls to the same op into a single line.
+   * \param sort Whether or not to sort call frames by descending duration. If
+   * false and if `aggregate` is false, frames will be sorted by order of
+   * appearance in the program. Order is undefined if `sort` is false and
+   * `aggregate` is true.
+   * \returns The report as a string.
+   */
+  String Report(bool aggregate = true, bool sort = true);
+  /*! \brief Check if the profiler is currently running.
+   * \returns Whether or not the profiler is running.
+   */
+  bool IsRunning() const { return !global_timers_.empty(); }
+
+ private:
+  std::vector<std::pair<Device, Timer>> global_timers_;
+  std::vector<CallFrame> calls_;
+  std::stack<CallFrame> in_flight_;
+};
+
+/* \brief A duration in time. */
+class DurationNode : public Object {
+ public:
+  /* The duration as a floating point number of microseconds. */
+  double microseconds;
+
+  /* \brief Construct a new duration.
+   * \param a The duration in microseconds.
+   */
+  explicit DurationNode(double a) : microseconds(a) {}
+
+  static constexpr const char* _type_key = "runtime.profiling.Duration";
+  TVM_DECLARE_FINAL_OBJECT_INFO(DurationNode, Object);
+};
+
+/* A percentage of something */
+class PercentNode : public Object {
+ public:
+  /* The percent as a floating point value out of 100%. i.e. if `percent` is 10 then we have 10%. */
+  double percent;
+
+  /* \brief Construct a new percentage.
+   * \param a The percentage out of 100.
+   */
+  explicit PercentNode(double a) : percent(a) {}
+
+  static constexpr const char* _type_key = "runtime.profiling.Percent";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PercentNode, Object);
+};
+
+/* A count of something */
+class CountNode : public Object {
+ public:
+  /* The actual count */
+  int64_t value;
+
+  /* \brief Construct a new count.
+   * \param a The count.
+   */
+  explicit CountNode(int64_t a) : value(a) {}
+
+  static constexpr const char* _type_key = "runtime.profiling.Count";
+  TVM_DECLARE_FINAL_OBJECT_INFO(CountNode, Object);
+};
+
+/*! \brief String representation of an array of NDArray shapes
+ * \param shapes Array of NDArrays to get the shapes of.
+ * \return A textual representation of the shapes. For example: `float32[2], int64[1, 2]`.
+ */
+String ShapeString(const std::vector<NDArray>& shapes);
+
+}  // namespace profiling
 }  // namespace runtime
 }  // namespace tvm
diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py
index b27ae6533e385..1a1696baac4a3 100644
--- a/python/tvm/contrib/debugger/debug_executor.py
+++ b/python/tvm/contrib/debugger/debug_executor.py
@@ -100,6 +100,7 @@ def __init__(self, module, device, graph_json_str, dump_root):
         self._dump_path = None
         self._get_output_by_layer = module["get_output_by_layer"]
         self._run_individual = module["run_individual"]
+        self._profile = module["profile"]
         graph_executor.GraphModule.__init__(self, module)
         self._create_debug_env(graph_json_str, device)

@@ -234,6 +235,24 @@ def run_individual(self, number, repeat=1, min_repeat_ms=0):
         ret = self._run_individual(number, repeat, min_repeat_ms)
         return ret.strip(",").split(",") if ret else []

+    def profile(self, **input_dict):
+        """Run forward execution of the graph and collect overall and per-op
+        performance metrics.
+
+        Parameters
+        ----------
+        input_dict : dict of str to NDArray
+            List of input values to be fed to the graph.
+
+        Returns
+        -------
+        timing_results : str
+            Per-operator and whole graph timing results in a table format.
+        """
+        if input_dict:
+            self.set_input(**input_dict)
+
+        return self._profile()
+
     def exit(self):
         """Exits the dump folder and all its contents"""
         self._remove_dump_root()
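
For orientation, the new debug-executor entry point can be driven end to end like this. A minimal sketch that mirrors `test_graph_executor` in the test file added below; the `llvm` target and the (1, 1, 28, 28) input shape are taken from that test:

```python
import numpy as np
import tvm
from tvm import relay
from tvm.relay.testing import mlp
from tvm.contrib.debugger import debug_executor

# Build a small workload and wrap it in the debug executor.
mod, params = mlp.get_workload(batch_size=1)
exe = relay.build(mod, "llvm", params=params)
gr = debug_executor.create(exe.get_json(), exe.lib, tvm.cpu())

# profile() sets inputs, runs the whole graph with warmup, and returns the
# per-op table plus overall totals as a string.
data = np.random.rand(1, 1, 28, 28).astype("float32")
print(gr.profile(data=data))
```
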
diff --git a/python/tvm/runtime/profiler_vm.py b/python/tvm/runtime/profiler_vm.py
index e70ef320289dd..e1c3dc66a360d 100644
--- a/python/tvm/runtime/profiler_vm.py
+++ b/python/tvm/runtime/profiler_vm.py
@@ -20,6 +20,7 @@
 Provides extra APIs for profiling vm execution.
 """
+import warnings
 from tvm.runtime import _ffi_api
 from . import vm

@@ -37,26 +38,37 @@ def __init__(self, exe, device, memory_cfg=None):
         self.module = _ffi_api._VirtualMachineDebug(exe.module)
         self._init = self.module["init"]
         self._invoke = self.module["invoke"]
-        self._get_stat = self.module["get_stat"]
+        self._profile = self.module["profile"]
         self._set_input = self.module["set_input"]
-        self._reset = self.module["reset"]
         self._setup_device(device, memory_cfg)

-    def get_stat(self, sort_by_time=True):
+    def get_stat(self, sort_by_time=True):  # pylint: disable=unused-argument
         """Get the statistics of executed ops.
+        REMOVED, use profile method instead.
+        """
+        warnings.warn("get_stat has been removed, use profile instead")
+        return ""
+
+    def profile(self, *args, func_name="main", **kwargs):
+        """Profile a function call.

         Parameters
         ----------
-        sort_by_time: Optional[Boolean]
-           Set to indicate the returned results are sorted by execution time in
-           the descending order. It is printed in the random order if this
-           field is not set.
+        func_name : str
+            The name of the function.
+
+        args : list[tvm.runtime.NDArray] or list[np.ndarray]
+            The arguments to the function.
+
+        kwargs : dict of str to tvm.runtime.NDArray or np.ndarray
+            Named arguments to the function.

         Returns
         -------
-        The execution statistics in string.
+        timing_results : str
+            Overall and per-op timing results formatted in a table.
         """
-        return self._get_stat(sort_by_time)
-
-    def reset(self):
-        self._reset()
+        if args or kwargs:
+            self.set_input(func_name, *args, **kwargs)
+        return self._profile(func_name)
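
The VM-side API mirrors the debug executor. A minimal sketch, again following the tests added at the end of this patch (workload, target, and input shape are carried over from them):

```python
import numpy as np
import tvm
from tvm import relay
from tvm.relay.testing import mlp
from tvm.runtime import profiler_vm

# Compile for the VM and wrap in the profiling VM.
mod, params = mlp.get_workload(batch_size=1)
exe = relay.vm.compile(mod, "llvm", params=params)
vm = profiler_vm.VirtualMachineProfiler(exe, tvm.cpu())

# profile() warms the function up, runs it once under the profiler, and
# returns the report string.
data = np.random.rand(1, 1, 28, 28).astype("float32")
print(vm.profile(tvm.nd.array(data), func_name="main"))
```
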
""" - return self._get_stat(sort_by_time) - - def reset(self): - self._reset() + if args or kwargs: + self.set_input(func_name, *args, **kwargs) + return self._profile(func_name) diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index 7c1e6960f9f53..ef3f9553fab7f 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -228,6 +228,50 @@ class GraphExecutorDebug : public GraphExecutor { data_entry_[eid].CopyTo(data_out); } + + /*! + * \brief Profile execution time of the module. + * + * We run the entire module while recording overall and per-op timing + * information. The module may be run multiple times to ensure everything is + * warmed up. This function is a more correct reflection of actual runtime of + * the module compared to GraphRuntimeDebug::RunIndividual as it runs the + * entire graph in order. + * + * \returns A table of per-op runtimes and total times. + */ + String Profile() { + // warm up. 1 iteration does not seem enough. + for (int i = 0; i < 3; i++) { + GraphExecutor::Run(); + } + + profiling::Profiler prof; + prof.Start(devices_); + for (size_t i = 0; i < op_execs_.size(); ++i) { + if (op_execs_[i]) { + // get argument shapes + std::vector shapes; + for (const auto& e : nodes_[i].inputs) { + uint32_t eid = entry_id(e); + shapes.push_back(data_entry_[eid]); + } + for (uint32_t j = 0; j < nodes_[i].param.num_outputs; ++j) { + uint32_t eid = entry_id(i, j); + shapes.push_back(data_entry_[eid]); + } + + uint32_t eid = entry_id(i, 0); + const Device& device = data_entry_[eid]->device; + prof.StartCall(nodes_[i].param.func_name, device, + {{"Argument Shapes", profiling::ShapeString(shapes)}}); + op_execs_[i](); + prof.StopCall(); + } + } + prof.Stop(); + return prof.Report(); + } }; /*! @@ -260,6 +304,8 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, ICHECK_GE(min_repeat_ms, 0); *rv = this->RunIndividual(number, repeat, min_repeat_ms); }); + } else if (name == "profile") { + return TypedPackedFunc([sptr_to_self, this]() { return this->Profile(); }); } else { return GraphExecutor::GetFunction(name, sptr_to_self); } diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index f33dc9f175b7c..e4e2b5201d89b 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -59,6 +60,7 @@ void GraphExecutor::Run() { if (op_execs_[i]) op_execs_[i](); } } + /*! * \brief Initialize the graph executor with graph and device. * \param graph_json The execution graph. diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 6cf1034a3837b..d68e5334c8a60 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -22,11 +22,15 @@ * \brief Runtime profiling including timers. 
diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc
index f33dc9f175b7c..e4e2b5201d89b 100644
--- a/src/runtime/graph_executor/graph_executor.cc
+++ b/src/runtime/graph_executor/graph_executor.cc
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include
 #include
 #include

@@ -59,6 +60,7 @@ void GraphExecutor::Run() {
     if (op_execs_[i]) op_execs_[i]();
   }
 }
+
 /*!
  * \brief Initialize the graph executor with graph and device.
  * \param graph_json The execution graph.
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
index 6cf1034a3837b..d68e5334c8a60 100644
--- a/src/runtime/profiling.cc
+++ b/src/runtime/profiling.cc
@@ -22,11 +22,15 @@
  * \brief Runtime profiling including timers.
  */

+#include
 #include
 #include
 #include
+#include
+#include
 #include
+#include

 namespace tvm {
 namespace runtime {
@@ -93,5 +97,272 @@ Timer Timer::Start(Device dev) {
 }

 TVM_REGISTER_GLOBAL("profiling.start_timer").set_body_typed(Timer::Start);
+
+namespace profiling {
+
+void Profiler::Start(const std::vector<Device>& devs) {
+  CHECK(global_timers_.empty()) << "You can only call Start once per Profiler.";
+  for (auto dev : devs) {
+    global_timers_.emplace_back(dev, Timer::Start(dev));
+  }
+}
+
+void Profiler::StartCall(String name, Device dev,
+                         std::unordered_map<std::string, ObjectRef> extra_metrics) {
+  in_flight_.push(CallFrame{dev, name, Timer::Start(dev), extra_metrics});
+}
+
+void Profiler::StopCall(std::unordered_map<std::string, ObjectRef> extra_metrics) {
+  CallFrame cf = in_flight_.top();
+  cf.timer->Stop();
+  for (auto& p : extra_metrics) {
+    cf.extra_metrics[p.first] = p.second;
+  }
+  in_flight_.pop();
+  calls_.push_back(cf);
+}
+
+void Profiler::Stop() {
+  // Stop all global timers. We wait to synchronize until we are making the report.
+  for (auto p : global_timers_) {
+    p.second->Stop();
+  }
+}
+
+String ShapeString(const std::vector<NDArray>& shapes) {
+  std::stringstream sizes;
+  for (const NDArray& ary : shapes) {
+    if (sizes.tellp() > 0) {
+      sizes << ", ";
+    }
+    auto shape = ary.Shape();
+    sizes << ary.DataType() << "[";
+    for (size_t i = 0; i < shape.size(); i++) {
+      if (i != 0) {
+        sizes << ", ";
+      }
+      sizes << shape[i];
+    }
+    sizes << "]";
+  }
+  return String(sizes.str());
+}
+
+std::string FormatTable(const std::vector<std::unordered_map<std::string, ObjectRef>>& rows,
+                        std::unordered_set<std::string> hidden_cols = {"Argument Shapes",
+                                                                       "Device"}) {
+  std::unordered_set<std::string> unique_headers;
+
+  for (auto row : rows) {
+    for (auto p : row) {
+      unique_headers.insert(p.first);
+    }
+  }
+
+  std::vector<std::string> headers = {"Name", "Duration (us)", "Percent"};
+  for (auto header : unique_headers) {
+    if (header != "Name" && header != "Duration (us)" && header != "Percent" &&
+        hidden_cols.find(header) == hidden_cols.end()) {
+      headers.push_back(header);
+    }
+  }
+
+  std::vector<std::vector<std::string>> cols;
+  for (auto header : headers) {
+    cols.push_back({header});
+  }
+  for (auto row : rows) {
+    for (size_t i = 0; i < headers.size(); i++) {
+      auto it = row.find(headers[i]);
+      if (it == row.end()) {
+        cols[i].push_back("");
+      } else {
+        std::string val;
+        if (it->second.as<CountNode>()) {
+          std::stringstream s;
+          s.imbue(std::locale(""));  // for 1000s separators
+          s << std::fixed << it->second.as<CountNode>()->value;
+          val = s.str();
+        } else if (it->second.as<DurationNode>()) {
+          std::stringstream s;
+          s.imbue(std::locale(""));  // for 1000s separators
+          s << std::fixed << std::setprecision(2) << it->second.as<DurationNode>()->microseconds;
+          val = s.str();
+        } else if (it->second.as<PercentNode>()) {
+          std::stringstream s;
+          s << std::fixed << std::setprecision(2) << it->second.as<PercentNode>()->percent;
+          val = s.str();
+        } else if (it->second.as<StringObj>()) {
+          val = Downcast<String>(it->second);
+        }
+        cols[i].push_back(val);
+      }
+    }
+  }
+
+  std::vector<size_t> widths;
+  for (auto v : cols) {
+    size_t width = 0;
+    for (auto x : v) {
+      width = std::max(width, x.size());
+    }
+    widths.push_back(width);
+  }
+  size_t length = 0;
+  for (auto v : cols) {
+    length = std::max(length, v.size());
+  }
+
+  std::stringstream s;
+  for (size_t row = 0; row < length; row++) {
+    for (size_t col = 0; col < cols.size(); col++) {
+      // left align first column
+      if (col == 0) {
+        s << std::left;
+      } else {
+        s << std::right;
+      }
+      if (row < cols[col].size()) {
+        s << std::setw(widths[col]) << cols[col][row] << " ";
+      } else {
+        s << std::setw(widths[col]) << ""
+          << " ";
+      }
+    }
+    s << std::endl;
+  }
+  return s.str();
+}
+
+String Profiler::Report(bool aggregate, bool sort) {
+  std::vector<std::pair<Device, double>> global_times;
+  for (auto p : global_timers_) {
+    global_times.emplace_back(p.first, p.second->SyncAndGetElapsedNanos() / 1e3);
+  }
+  double overall_time = 0.;
+  for (auto p : global_times) {
+    overall_time = std::max(overall_time, p.second);
+  }
+
+  // aggregate times by op name
+  std::vector<std::pair<std::string, std::vector<size_t>>> aggregate_rows;
+  if (aggregate) {
+    std::unordered_map<std::string, std::vector<size_t>> aggregates;
+    for (size_t i = 0; i < calls_.size(); i++) {
+      CallFrame& cf = calls_[i];
+      std::string name = cf.name;
+      // don't aggregate dynamic ops with different shapes
+      auto it = cf.extra_metrics.find("Argument Shapes");
+      if (it != cf.extra_metrics.end()) {
+        name = name + Downcast<String>(it->second);
+      }
+
+      if (aggregates.find(name) == aggregates.end()) {
+        aggregates[name] = {i};
+      } else {
+        aggregates[name].push_back(i);
+      }
+    }
+    for (const auto& p : aggregates) {
+      aggregate_rows.push_back(p);
+    }
+  } else {
+    for (size_t i = 0; i < calls_.size(); i++) {
+      aggregate_rows.push_back({calls_[i].name, {i}});
+    }
+  }
+
+  // aggregated rows (poor man's dataframe)
+  std::vector<std::unordered_map<std::string, ObjectRef>> rows;
+
+  // form aggregates and compute aggregate statistics (sum).
+  for (auto p : aggregate_rows) {
+    std::unordered_map<std::string, ObjectRef> row;
+    double time_sum = 0;
+    size_t count = 0;
+    for (auto i : p.second) {
+      double us = calls_[i].timer->SyncAndGetElapsedNanos() / 1e3;
+      time_sum += us;
+      count += 1;
+    }
+    row["Percent"] = ObjectRef(make_object<PercentNode>(time_sum / overall_time * 100));
+    row["Duration (us)"] = ObjectRef(make_object<DurationNode>(time_sum));
+    row["Count"] = ObjectRef(make_object<CountNode>(count));
+    row["Name"] = calls_[p.second[0]].name;
+    Device dev = calls_[p.second[0]].dev;
+    row["Device"] = String(DeviceName(dev.device_type) + std::to_string(dev.device_id));
+
+    // assume all rows in the aggregate have the same metrics
+    for (auto metric : calls_[p.second[0]].extra_metrics) {
+      if (metric.second.as<CountNode>()) {
+        int64_t sum = 0;
+        for (auto i : p.second) {
+          sum += calls_[i].extra_metrics[metric.first].as<CountNode>()->value;
+        }
+        row[metric.first] = ObjectRef(make_object<CountNode>(sum));
+      } else if (metric.second.as<DurationNode>()) {
+        double sum = 0;
+        for (auto i : p.second) {
+          sum += calls_[i].extra_metrics[metric.first].as<DurationNode>()->microseconds;
+        }
+        row[metric.first] = ObjectRef(make_object<DurationNode>(sum));
+      } else if (metric.second.as<PercentNode>()) {
+        double sum = 0;
+        for (auto i : p.second) {
+          sum += calls_[i].extra_metrics[metric.first].as<PercentNode>()->percent;
+        }
+        row[metric.first] = ObjectRef(make_object<PercentNode>(sum));
+      } else if (metric.second.as<StringObj>()) {
+        // assume all rows contain the same value for this metric
+        row[metric.first] = Downcast<String>(metric.second);
+      }
+    }
+
+    rows.push_back(row);
+  }
+
+  // sort rows by duration
+  if (sort) {
+    std::sort(rows.begin(), rows.end(),
+              [&](const std::unordered_map<std::string, ObjectRef>& a,
+                  const std::unordered_map<std::string, ObjectRef>& b) {
+                return a.at("Duration (us)").as<DurationNode>()->microseconds >
+                       b.at("Duration (us)").as<DurationNode>()->microseconds;
+              });
+  }
+
+  double op_sum = 0;
+  int64_t total_count = 0;
+  double per = 0;
+  for (auto row : rows) {
+    op_sum += row["Duration (us)"].as<DurationNode>()->microseconds;
+    total_count += row["Count"].as<CountNode>()->value;
+    per += row["Percent"].as<PercentNode>()->percent;
+  }
+
+  rows.push_back({{"Name", String("------------------")}});
+  rows.push_back({{"Name", String("Total")},
+                  {"Duration (us)", ObjectRef(make_object<DurationNode>(op_sum))},
+                  {"Count", ObjectRef(make_object<CountNode>(total_count))},
+                  {"Percent", ObjectRef(make_object<PercentNode>(per))}});
+
+  std::stringstream s;
+  s.imbue(std::locale(""));
+  s << FormatTable(rows);
+  s << std::fixed << std::setprecision(2);
+  for (auto p : global_times) {
+    s << "Total time " << DeviceName(p.first.device_type) << p.first.device_id << ": " << p.second
+      << "us" << std::endl;
+  }
+  s << "Overhead: " << overall_time - op_sum << "us "
+    << (overall_time - op_sum) / overall_time * 100 << "% (Time not spent in operators)";
+
+  return s.str();
+}
+
+TVM_REGISTER_OBJECT_TYPE(DurationNode);
+TVM_REGISTER_OBJECT_TYPE(PercentNode);
+TVM_REGISTER_OBJECT_TYPE(CountNode);
+}  // namespace profiling
 }  // namespace runtime
 }  // namespace tvm
time " << DeviceName(p.first.device_type) << p.first.device_id << ": " << p.second + << "us" << std::endl; + } + s << "Overhead: " << overall_time - op_sum << "us " + << (overall_time - op_sum) / overall_time * 100 << "% (Time not spent in operators)"; + + return s.str(); +} + +TVM_REGISTER_OBJECT_TYPE(DurationNode); +TVM_REGISTER_OBJECT_TYPE(PercentNode); +TVM_REGISTER_OBJECT_TYPE(CountNode); +} // namespace profiling } // namespace runtime } // namespace tvm diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc index 3f3dee437cb35..2c921b7af0030 100644 --- a/src/runtime/vm/profiler/vm.cc +++ b/src/runtime/vm/profiler/vm.cc @@ -41,60 +41,26 @@ namespace vm { PackedFunc VirtualMachineDebug::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { - if (name == "get_stat") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - ICHECK_EQ(args.size(), 1U); - std::vector> op_acc_time; - std::unordered_map> op_durations; - for (auto kv : op_timers_) { - std::vector durations_us; - for (auto t : kv.second) { - durations_us.push_back(t->SyncAndGetElapsedNanos() / 1e3); + if (name == "profile") { + return TypedPackedFunc([sptr_to_self, this](String arg_name) { + std::vector devices; + for (auto dev : devices_) { + if (dev.device_type > 0) { + devices.push_back(dev); } - op_durations[kv.first] = durations_us; } - for (auto kv : op_durations) { - auto val = - std::make_pair(kv.first, std::accumulate(kv.second.begin(), kv.second.end(), 0.0)); - op_acc_time.push_back(val); - } - bool sort_by_time = args[0]; - if (sort_by_time) { - auto comp = [](const std::pair& lhs, const std::pair& rhs) { - return lhs.second > rhs.second; - }; - std::sort(op_acc_time.begin(), op_acc_time.end(), comp); - } - double total_duration = 0.0; - int64_t total_packed_funcs = 0; - std::ostringstream os; - os << std::setw(30) << std::left << "#OpName" - << "\t" << std::setw(10) << std::left << "#InvokeCount" - << "\t" - << "#Duration(us): Sum/Mean/Min/Max" << std::endl; - - for (auto kv : op_acc_time) { - auto vals = op_durations[kv.first]; - auto sum = kv.second; - auto mean = sum / static_cast(vals.size()); - auto min_value = *std::min_element(vals.begin(), vals.end()); - auto max_value = *std::max_element(vals.begin(), vals.end()); - os << std::setw(30) << std::left << packed_index_map_[kv.first] << "\t" << std::setw(10) - << std::left << op_invokes_[kv.first] << "\t" << sum << "/" << mean << "/" << min_value - << "/" << max_value << std::endl; - - total_duration += sum; - total_packed_funcs += op_invokes_[kv.first]; + auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self); + // warmup + for (int i = 0; i < 3; i++) { + invoke(arg_name); } - os << "\nTotal Duration: " << total_duration << " us.\t" - << "Total Packed Functions: " << total_packed_funcs << std::endl; - *rv = os.str(); - }); - } else if (name == "reset") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - op_timers_.clear(); - op_invokes_.clear(); + + prof_ = profiling::Profiler(); // reset profiler + prof_.Start(devices); + invoke(arg_name); + prof_.Stop(); + return prof_.Report(); }); } else { return VirtualMachine::GetFunction(name, sptr_to_self); @@ -106,7 +72,6 @@ void VirtualMachineDebug::LoadExecutable(const Executable* exec) { ICHECK(exec_); for (auto kv : exec_->primitive_map) { packed_index_map_[kv.second] = kv.first; - op_invokes_[kv.second] = 0; } } @@ -114,23 +79,38 @@ void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& fun Index 
diff --git a/src/runtime/vm/profiler/vm.h b/src/runtime/vm/profiler/vm.h
index 9f5ce87bcf472..521a9bd454e7d 100644
--- a/src/runtime/vm/profiler/vm.h
+++ b/src/runtime/vm/profiler/vm.h
@@ -52,8 +52,7 @@ class VirtualMachineDebug : public VirtualMachine {
                     const std::vector<ObjectRef>& args) final;

   std::unordered_map<Index, std::string> packed_index_map_;
-  std::unordered_map<Index, std::vector<Timer>> op_timers_;
-  std::unordered_map<Index, int> op_invokes_;
+  profiling::Profiler prof_;
 };

 }  // namespace vm
diff --git a/tests/cpp/profiling.cc b/tests/cpp/profiling_test.cc
similarity index 100%
rename from tests/cpp/profiling.cc
rename to tests/cpp/profiling_test.cc
diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py
new file mode 100644
index 0000000000000..c4d855a3c7d3b
--- /dev/null
+++ b/tests/python/unittest/test_runtime_profiling.py
@@ -0,0 +1,51 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import numpy as np
+import pytest
+
+import tvm.testing
+from tvm.runtime import profiler_vm
+from tvm import relay
+from tvm.relay.testing import mlp
+from tvm.contrib.debugger import debug_executor
+
+
+@pytest.mark.skipif(not profiler_vm.enabled(), reason="VM Profiler not enabled")
+@tvm.testing.parametrize_targets
+def test_vm(target, dev):
+    mod, params = mlp.get_workload(1)
+
+    exe = relay.vm.compile(mod, target, params=params)
+    vm = profiler_vm.VirtualMachineProfiler(exe, dev)
+
+    data = np.random.rand(1, 1, 28, 28).astype("float32")
+    report = vm.profile([data], func_name="main")
+    assert "fused_nn_softmax" in report
+    assert "Total time" in report
+
+
+@tvm.testing.parametrize_targets
+def test_graph_executor(target, dev):
+    mod, params = mlp.get_workload(1)
+
+    exe = relay.build(mod, target, params=params)
+    gr = debug_executor.create(exe.get_json(), exe.lib, dev)
+
+    data = np.random.rand(1, 1, 28, 28).astype("float32")
+    report = gr.profile(data=data)
+    assert "fused_nn_softmax" in report
+    assert "Total time" in report
diff --git a/tests/python/unittest/test_runtime_vm_profiler.py b/tests/python/unittest/test_runtime_vm_profiler.py
index c1c94211a664c..7eb867a85bd02 100644
--- a/tests/python/unittest/test_runtime_vm_profiler.py
+++ b/tests/python/unittest/test_runtime_vm_profiler.py
@@ -16,25 +16,25 @@
 # under the License.
 import numpy as np

+import tvm
 from tvm.runtime import profiler_vm
 from tvm import relay
-from tvm.relay.testing import resnet, enabled_targets
+from tvm.relay.testing import mlp


-def test_basic():
-    mod, params = resnet.get_workload()
+@tvm.testing.parametrize_targets
+def test_basic(dev, target):
+    mod, params = mlp.get_workload(batch_size=1)
     if not profiler_vm.enabled():
         return

-    for target, dev in enabled_targets():
-        exe = relay.vm.compile(mod, target, params=params)
-        vm = profiler_vm.VirtualMachineProfiler(exe, dev)
+    exe = relay.vm.compile(mod, target, params=params)
+    vm = profiler_vm.VirtualMachineProfiler(exe, dev)

-        data = np.random.rand(1, 3, 224, 224).astype("float32")
-        res = vm.invoke("main", [data])
-        print("\n{}".format(vm.get_stat()))
-        print("\n{}".format(vm.get_stat(False)))
+    data = np.random.rand(1, 1, 28, 28).astype("float32")
+    res = vm.profile(tvm.nd.array(data), func_name="main")
+    assert "softmax" in res


 if __name__ == "__main__":
-    test_basic()
+    test_basic(tvm.cpu(), tvm.target.Target("llvm"))
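
Both entry points return the report as a plain string laid out by `FormatTable`/`Report` above: a header row with Name, Duration (us), Percent, and Count, one row per op, a Total row, then one "Total time" line per device and an "Overhead" line. A sketch of a structural check, reusing `vm` and `data` from the VM example earlier:

```python
# Sanity-check the report layout produced by Profiler::Report.
report = vm.profile(tvm.nd.array(data), func_name="main")
for marker in ("Duration (us)", "Percent", "Total time", "Overhead"):
    assert marker in report
```
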