From 2ac53b106751629c41dc0bb6844db9d6c3685783 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Thu, 1 Apr 2021 09:34:33 -0700
Subject: [PATCH] [Profiling,VM] Profiling interface for VM and Graph Runtime
 (#7624)

* [Profiling,VM] Profiling interface for VM and Graph Runtime

* lint

* fix test

* make profiling test actually run

* Try to better match the graph runtime function names to vm

* formatting

* DurationNode.value -> microseconds; PercentNode.value -> percent; make
  frame sorting optional.

* renaming for the tvmcontext -> device change

* formatting

* remove old vm profiler get_stat api

* fix tests
---
 include/tvm/runtime/profiling.h               | 141 +++++++++
 python/tvm/contrib/debugger/debug_executor.py |  19 ++
 python/tvm/runtime/profiler_vm.py             |  36 ++-
 .../debug/graph_executor_debug.cc             |  46 +++
 src/runtime/graph_executor/graph_executor.cc  |   2 +
 src/runtime/profiling.cc                      | 271 ++++++++++++++++++
 src/runtime/vm/profiler/vm.cc                 | 112 +++-----
 src/runtime/vm/profiler/vm.h                  |   3 +-
 tests/cpp/{profiling.cc => profiling_test.cc} |   0
 .../python/unittest/test_runtime_profiling.py |  51 ++++
 .../unittest/test_runtime_vm_profiler.py      |  22 +-
 11 files changed, 612 insertions(+), 91 deletions(-)
 rename tests/cpp/{profiling.cc => profiling_test.cc} (100%)
 create mode 100644 tests/python/unittest/test_runtime_profiling.py

diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index fa16e7820b352..912ca26ec0f1e 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -27,9 +27,14 @@
 #include
 #include
 #include
+#include
 #include
+#include
 #include
+#include
+#include
+#include

 namespace tvm {
 namespace runtime {
@@ -144,6 +149,142 @@ class Timer : public ObjectRef {
  */
 Timer DefaultTimer(Device dev);

+namespace profiling {
+
+/*! Information about a single function or operator call. */
+struct CallFrame {
+  /*! Device on which the call was made */
+  Device dev;
+  /*! Name of the function or op */
+  String name;
+  /*! Runtime of the function or op */
+  Timer timer;
+  /*! Extra performance metrics */
+  std::unordered_map<std::string, ObjectRef> extra_metrics;
+};
+
+/*! Runtime profiler for function and/or operator calls. Used in the graph
+ * runtime and VM to provide profiling information for all operators.
+ *
+ * Example usage:
+ * \code{.cpp}
+ * Profiler prof;
+ * Device cpu, gpu;
+ * prof.Start({cpu, gpu});
+ * prof.StartCall("my_gpu_kernel", gpu);
+ * my_gpu_kernel();
+ * prof.StopCall();
+ * prof.StartCall("my_cpu_function", cpu);
+ * my_cpu_function();
+ * prof.StopCall();
+ * prof.Stop();
+ * std::cout << prof.Report() << std::endl;  // print profiling report
+ * \endcode
+ */
+class Profiler {
+ public:
+  /*! \brief Start the profiler.
+   * \param devs The list of devices the profiler will be running on. Should
+   * include all devices used by profiled operators.
+   *
+   * This function should only be called once per object.
+   */
+  void Start(const std::vector<Device>& devs);
+  /*! \brief Stop the profiler.
+   *
+   * This function should only be called once per object, after `Start` has been called.
+   */
+  void Stop();
+  /*! \brief Start a function call.
+   * \param name The name of the function being called.
+   * \param dev The device on which the function is running.
+   * \param extra_metrics Optional additional profiling information to add to
+   * the frame (input sizes, allocations).
+   *
+   * `StartCall` may be nested, but each `StartCall` needs a matching
+   * `StopCall`. Function calls are stopped in LIFO order, so calls to
+   * `StartCall` and `StopCall` must be nested properly.
+   */
+  void StartCall(String name, Device dev,
+                 std::unordered_map<std::string, ObjectRef> extra_metrics = {});
+  /*! \brief Stop the last `StartCall`.
+   * \param extra_metrics Optional additional profiling information to add to
+   * the frame (input sizes, allocations).
+   */
+  void StopCall(std::unordered_map<std::string, ObjectRef> extra_metrics = {});
+  /*! \brief A textual report of total runtime between `Start` and `Stop` as
+   * well as individual statistics for each `StartCall`-`StopCall` pair.
+   * \param aggregate Whether or not to join multiple calls to the same op into a single line.
+   * \param sort Whether or not to sort call frames by descending duration. If
+   * false and if `aggregate` is false, frames will be sorted by order of
+   * appearance in the program. Order is undefined if `sort` is false and
+   * `aggregate` is true.
+   * \returns The report as a string.
+   */
+  String Report(bool aggregate = true, bool sort = true);
+  /*! \brief Check if the profiler is currently running.
+   * \returns Whether or not the profiler is running.
+   */
+  bool IsRunning() const { return !global_timers_.empty(); }
+
+ private:
+  std::vector<std::pair<Device, Timer>> global_timers_;
+  std::vector<CallFrame> calls_;
+  std::stack<CallFrame> in_flight_;
+};
+
+/* \brief A duration in time. */
+class DurationNode : public Object {
+ public:
+  /* The duration as a floating point number of microseconds. */
+  double microseconds;
+
+  /* \brief Construct a new duration.
+   * \param a The duration in microseconds.
+   */
+  explicit DurationNode(double a) : microseconds(a) {}
+
+  static constexpr const char* _type_key = "runtime.profiling.Duration";
+  TVM_DECLARE_FINAL_OBJECT_INFO(DurationNode, Object);
+};
+
+/* A percentage of something */
+class PercentNode : public Object {
+ public:
+  /* The percent as a floating point value out of 100%. i.e. if `percent` is 10 then we have 10%. */
+  double percent;
+
+  /* \brief Construct a new percentage.
+   * \param a The percentage out of 100.
+   */
+  explicit PercentNode(double a) : percent(a) {}
+
+  static constexpr const char* _type_key = "runtime.profiling.Percent";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PercentNode, Object);
+};
+
+/* A count of something */
+class CountNode : public Object {
+ public:
+  /* The actual count */
+  int64_t value;
+
+  /* \brief Construct a new count.
+   * \param a The count.
+   */
+  explicit CountNode(int64_t a) : value(a) {}
+
+  static constexpr const char* _type_key = "runtime.profiling.Count";
+  TVM_DECLARE_FINAL_OBJECT_INFO(CountNode, Object);
+};
+
+/*! \brief String representation of an array of NDArray shapes
+ * \param shapes Array of NDArrays to get the shapes of.
+ * \return A textual representation of the shapes. For example: `float32[2], int64[1, 2]`.
+ */
+String ShapeString(const std::vector<NDArray>& shapes);
+
+}  // namespace profiling
 }  // namespace runtime
 }  // namespace tvm
diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py
index b27ae6533e385..1a1696baac4a3 100644
--- a/python/tvm/contrib/debugger/debug_executor.py
+++ b/python/tvm/contrib/debugger/debug_executor.py
@@ -100,6 +100,7 @@ def __init__(self, module, device, graph_json_str, dump_root):
         self._dump_path = None
         self._get_output_by_layer = module["get_output_by_layer"]
         self._run_individual = module["run_individual"]
+        self._profile = module["profile"]
         graph_executor.GraphModule.__init__(self, module)
         self._create_debug_env(graph_json_str, device)

@@ -234,6 +235,24 @@ def run_individual(self, number, repeat=1, min_repeat_ms=0):
         ret = self._run_individual(number, repeat, min_repeat_ms)
         return ret.strip(",").split(",") if ret else []

+    def profile(self, **input_dict):
+        """Run forward execution of the graph and collect overall and per-op
+        performance metrics.
+
+        Parameters
+        ----------
+        input_dict : dict of str to NDArray
+            List of input values to be fed to the graph.
+
+        Returns
+        -------
+        timing_results : str
+            Per-operator and whole graph timing results in a table format.
+        """
+        if input_dict:
+            self.set_input(**input_dict)
+
+        return self._profile()
+
     def exit(self):
         """Exits the dump folder and all its contents"""
         self._remove_dump_root()
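
For orientation, the new debug-executor entry point can be driven end to end like this. A minimal sketch that mirrors `test_graph_executor` in the test file added below; the `llvm` target and the (1, 1, 28, 28) input shape are taken from that test:

```python
import numpy as np
import tvm
from tvm import relay
from tvm.relay.testing import mlp
from tvm.contrib.debugger import debug_executor

# Build a small workload and wrap it in the debug executor.
mod, params = mlp.get_workload(batch_size=1)
exe = relay.build(mod, "llvm", params=params)
gr = debug_executor.create(exe.get_json(), exe.lib, tvm.cpu())

# profile() sets inputs, runs the whole graph with warmup, and returns the
# per-op table plus overall totals as a string.
data = np.random.rand(1, 1, 28, 28).astype("float32")
print(gr.profile(data=data))
```
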
diff --git a/python/tvm/runtime/profiler_vm.py b/python/tvm/runtime/profiler_vm.py
index e70ef320289dd..e1c3dc66a360d 100644
--- a/python/tvm/runtime/profiler_vm.py
+++ b/python/tvm/runtime/profiler_vm.py
@@ -20,6 +20,7 @@
 Provides extra APIs for profiling vm execution.
 """
+import warnings
 from tvm.runtime import _ffi_api
 from . import vm

@@ -37,26 +38,37 @@ def __init__(self, exe, device, memory_cfg=None):
         self.module = _ffi_api._VirtualMachineDebug(exe.module)
         self._init = self.module["init"]
         self._invoke = self.module["invoke"]
-        self._get_stat = self.module["get_stat"]
+        self._profile = self.module["profile"]
         self._set_input = self.module["set_input"]
-        self._reset = self.module["reset"]
         self._setup_device(device, memory_cfg)

-    def get_stat(self, sort_by_time=True):
+    def get_stat(self, sort_by_time=True):  # pylint: disable=unused-argument
         """Get the statistics of executed ops.
+        REMOVED, use profile method instead.
+        """
+        warnings.warn("get_stat has been removed, use profile instead")
+        return ""
+
+    def profile(self, *args, func_name="main", **kwargs):
+        """Profile a function call.

         Parameters
         ----------
-        sort_by_time: Optional[Boolean]
-           Set to indicate the returned results are sorted by execution time in
-           the descending order. It is printed in the random order if this
-           field is not set.
+        func_name : str
+            The name of the function.
+
+        args : list[tvm.runtime.NDArray] or list[np.ndarray]
+            The arguments to the function.
+
+        kwargs : dict of str to tvm.runtime.NDArray or np.ndarray
+            Named arguments to the function.

         Returns
         -------
-        The execution statistics in string.
+        timing_results : str
+            Overall and per-op timing results formatted in a table.
         """
-        return self._get_stat(sort_by_time)
-
-    def reset(self):
-        self._reset()
+        if args or kwargs:
+            self.set_input(func_name, *args, **kwargs)
+        return self._profile(func_name)
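
The VM-side API mirrors the debug executor. A minimal sketch, again following the tests added at the end of this patch (workload, target, and input shape are carried over from them):

```python
import numpy as np
import tvm
from tvm import relay
from tvm.relay.testing import mlp
from tvm.runtime import profiler_vm

# Compile for the VM and wrap in the profiling VM.
mod, params = mlp.get_workload(batch_size=1)
exe = relay.vm.compile(mod, "llvm", params=params)
vm = profiler_vm.VirtualMachineProfiler(exe, tvm.cpu())

# profile() warms the function up, runs it once under the profiler, and
# returns the report string.
data = np.random.rand(1, 1, 28, 28).astype("float32")
print(vm.profile(tvm.nd.array(data), func_name="main"))
```
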
""" - return self._get_stat(sort_by_time) - - def reset(self): - self._reset() + if args or kwargs: + self.set_input(func_name, *args, **kwargs) + return self._profile(func_name) diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index 7c1e6960f9f53..ef3f9553fab7f 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -228,6 +228,50 @@ class GraphExecutorDebug : public GraphExecutor { data_entry_[eid].CopyTo(data_out); } + + /*! + * \brief Profile execution time of the module. + * + * We run the entire module while recording overall and per-op timing + * information. The module may be run multiple times to ensure everything is + * warmed up. This function is a more correct reflection of actual runtime of + * the module compared to GraphRuntimeDebug::RunIndividual as it runs the + * entire graph in order. + * + * \returns A table of per-op runtimes and total times. + */ + String Profile() { + // warm up. 1 iteration does not seem enough. + for (int i = 0; i < 3; i++) { + GraphExecutor::Run(); + } + + profiling::Profiler prof; + prof.Start(devices_); + for (size_t i = 0; i < op_execs_.size(); ++i) { + if (op_execs_[i]) { + // get argument shapes + std::vector shapes; + for (const auto& e : nodes_[i].inputs) { + uint32_t eid = entry_id(e); + shapes.push_back(data_entry_[eid]); + } + for (uint32_t j = 0; j < nodes_[i].param.num_outputs; ++j) { + uint32_t eid = entry_id(i, j); + shapes.push_back(data_entry_[eid]); + } + + uint32_t eid = entry_id(i, 0); + const Device& device = data_entry_[eid]->device; + prof.StartCall(nodes_[i].param.func_name, device, + {{"Argument Shapes", profiling::ShapeString(shapes)}}); + op_execs_[i](); + prof.StopCall(); + } + } + prof.Stop(); + return prof.Report(); + } }; /*! @@ -260,6 +304,8 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, ICHECK_GE(min_repeat_ms, 0); *rv = this->RunIndividual(number, repeat, min_repeat_ms); }); + } else if (name == "profile") { + return TypedPackedFunc([sptr_to_self, this]() { return this->Profile(); }); } else { return GraphExecutor::GetFunction(name, sptr_to_self); } diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index f33dc9f175b7c..e4e2b5201d89b 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -59,6 +60,7 @@ void GraphExecutor::Run() { if (op_execs_[i]) op_execs_[i](); } } + /*! * \brief Initialize the graph executor with graph and device. * \param graph_json The execution graph. diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 6cf1034a3837b..d68e5334c8a60 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -22,11 +22,15 @@ * \brief Runtime profiling including timers. 
diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc
index f33dc9f175b7c..e4e2b5201d89b 100644
--- a/src/runtime/graph_executor/graph_executor.cc
+++ b/src/runtime/graph_executor/graph_executor.cc
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include
 #include
 #include

@@ -59,6 +60,7 @@ void GraphExecutor::Run() {
     if (op_execs_[i]) op_execs_[i]();
   }
 }
+
 /*!
  * \brief Initialize the graph executor with graph and device.
  * \param graph_json The execution graph.
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
index 6cf1034a3837b..d68e5334c8a60 100644
--- a/src/runtime/profiling.cc
+++ b/src/runtime/profiling.cc
@@ -22,11 +22,15 @@
  * \brief Runtime profiling including timers.
  */

+#include
 #include
 #include
 #include
+#include
+#include
 #include
+#include

 namespace tvm {
 namespace runtime {
@@ -93,5 +97,272 @@ Timer Timer::Start(Device dev) {
 }

 TVM_REGISTER_GLOBAL("profiling.start_timer").set_body_typed(Timer::Start);
+
+namespace profiling {
+
+void Profiler::Start(const std::vector<Device>& devs) {
+  CHECK(global_timers_.empty()) << "You can only call Start once per Profiler.";
+  for (auto dev : devs) {
+    global_timers_.emplace_back(dev, Timer::Start(dev));
+  }
+}
+
+void Profiler::StartCall(String name, Device dev,
+                         std::unordered_map<std::string, ObjectRef> extra_metrics) {
+  in_flight_.push(CallFrame{dev, name, Timer::Start(dev), extra_metrics});
+}
+
+void Profiler::StopCall(std::unordered_map<std::string, ObjectRef> extra_metrics) {
+  CallFrame cf = in_flight_.top();
+  cf.timer->Stop();
+  for (auto& p : extra_metrics) {
+    cf.extra_metrics[p.first] = p.second;
+  }
+  in_flight_.pop();
+  calls_.push_back(cf);
+}
+
+void Profiler::Stop() {
+  // Stop all global timers. We wait to synchronize until we are making the report.
+  for (auto p : global_timers_) {
+    p.second->Stop();
+  }
+}
+
+String ShapeString(const std::vector<NDArray>& shapes) {
+  std::stringstream sizes;
+  for (const NDArray& ary : shapes) {
+    if (sizes.tellp() > 0) {
+      sizes << ", ";
+    }
+    auto shape = ary.Shape();
+    sizes << ary.DataType() << "[";
+    for (size_t i = 0; i < shape.size(); i++) {
+      if (i != 0) {
+        sizes << ", ";
+      }
+      sizes << shape[i];
+    }
+    sizes << "]";
+  }
+  return String(sizes.str());
+}
+
+std::string FormatTable(const std::vector<std::unordered_map<std::string, ObjectRef>>& rows,
+                        std::unordered_set<std::string> hidden_cols = {"Argument Shapes",
+                                                                       "Device"}) {
+  std::unordered_set<std::string> unique_headers;
+
+  for (auto row : rows) {
+    for (auto p : row) {
+      unique_headers.insert(p.first);
+    }
+  }
+
+  std::vector<std::string> headers = {"Name", "Duration (us)", "Percent"};
+  for (auto header : unique_headers) {
+    if (header != "Name" && header != "Duration (us)" && header != "Percent" &&
+        hidden_cols.find(header) == hidden_cols.end()) {
+      headers.push_back(header);
+    }
+  }
+
+  std::vector<std::vector<std::string>> cols;
+  for (auto header : headers) {
+    cols.push_back({header});
+  }
+  for (auto row : rows) {
+    for (size_t i = 0; i < headers.size(); i++) {
+      auto it = row.find(headers[i]);
+      if (it == row.end()) {
+        cols[i].push_back("");
+      } else {
+        std::string val;
+        if (it->second.as<CountNode>()) {
+          std::stringstream s;
+          s.imbue(std::locale(""));  // for 1000s separators
+          s << std::fixed << it->second.as<CountNode>()->value;
+          val = s.str();
+        } else if (it->second.as<DurationNode>()) {
+          std::stringstream s;
+          s.imbue(std::locale(""));  // for 1000s separators
+          s << std::fixed << std::setprecision(2) << it->second.as<DurationNode>()->microseconds;
+          val = s.str();
+        } else if (it->second.as<PercentNode>()) {
+          std::stringstream s;
+          s << std::fixed << std::setprecision(2) << it->second.as<PercentNode>()->percent;
+          val = s.str();
+        } else if (it->second.as<StringObj>()) {
+          val = Downcast<String>(it->second);
+        }
+        cols[i].push_back(val);
+      }
+    }
+  }
+
+  std::vector<size_t> widths;
+  for (auto v : cols) {
+    size_t width = 0;
+    for (auto x : v) {
+      width = std::max(width, x.size());
+    }
+    widths.push_back(width);
+  }
+  size_t length = 0;
+  for (auto v : cols) {
+    length = std::max(length, v.size());
+  }
+
+  std::stringstream s;
+  for (size_t row = 0; row < length; row++) {
+    for (size_t col = 0; col < cols.size(); col++) {
+      // left align first column
+      if (col == 0) {
+        s << std::left;
+      } else {
+        s << std::right;
+      }
+      if (row < cols[col].size()) {
+        s << std::setw(widths[col]) << cols[col][row] << " ";
+      } else {
+        s << std::setw(widths[col]) << ""
+          << " ";
+      }
+    }
+    s << std::endl;
+  }
+  return s.str();
+}
+
+String Profiler::Report(bool aggregate, bool sort) {
+  std::vector<std::pair<Device, double>> global_times;
+  for (auto p : global_timers_) {
+    global_times.emplace_back(p.first, p.second->SyncAndGetElapsedNanos() / 1e3);
+  }
+  double overall_time = 0.;
+  for (auto p : global_times) {
+    overall_time = std::max(overall_time, p.second);
+  }
+
+  // aggregate times by op name
+  std::vector<std::pair<std::string, std::vector<size_t>>> aggregate_rows;
+  if (aggregate) {
+    std::unordered_map<std::string, std::vector<size_t>> aggregates;
+    for (size_t i = 0; i < calls_.size(); i++) {
+      CallFrame& cf = calls_[i];
+      std::string name = cf.name;
+      // don't aggregate dynamic ops with different shapes
+      auto it = cf.extra_metrics.find("Argument Shapes");
+      if (it != cf.extra_metrics.end()) {
+        name = name + Downcast<String>(it->second);
+      }
+
+      if (aggregates.find(name) == aggregates.end()) {
+        aggregates[name] = {i};
+      } else {
+        aggregates[name].push_back(i);
+      }
+    }
+    for (const auto& p : aggregates) {
+      aggregate_rows.push_back(p);
+    }
+  } else {
+    for (size_t i = 0; i < calls_.size(); i++) {
+      aggregate_rows.push_back({calls_[i].name, {i}});
+    }
+  }
+
+  // aggregated rows (poor man's dataframe)
+  std::vector<std::unordered_map<std::string, ObjectRef>> rows;
+
+  // form aggregates and compute aggregate statistics (sum).
+  for (auto p : aggregate_rows) {
+    std::unordered_map<std::string, ObjectRef> row;
+    double time_sum = 0;
+    size_t count = 0;
+    for (auto i : p.second) {
+      double us = calls_[i].timer->SyncAndGetElapsedNanos() / 1e3;
+      time_sum += us;
+      count += 1;
+    }
+    row["Percent"] = ObjectRef(make_object<PercentNode>(time_sum / overall_time * 100));
+    row["Duration (us)"] = ObjectRef(make_object<DurationNode>(time_sum));
+    row["Count"] = ObjectRef(make_object<CountNode>(count));
+    row["Name"] = calls_[p.second[0]].name;
+    Device dev = calls_[p.second[0]].dev;
+    row["Device"] = String(DeviceName(dev.device_type) + std::to_string(dev.device_id));
+
+    // assume all rows in the aggregate have the same metrics
+    for (auto metric : calls_[p.second[0]].extra_metrics) {
+      if (metric.second.as<CountNode>()) {
+        int64_t sum = 0;
+        for (auto i : p.second) {
+          sum += calls_[i].extra_metrics[metric.first].as<CountNode>()->value;
+        }
+        row[metric.first] = ObjectRef(make_object<CountNode>(sum));
+      } else if (metric.second.as<DurationNode>()) {
+        double sum = 0;
+        for (auto i : p.second) {
+          sum += calls_[i].extra_metrics[metric.first].as<DurationNode>()->microseconds;
+        }
+        row[metric.first] = ObjectRef(make_object<DurationNode>(sum));
+      } else if (metric.second.as<PercentNode>()) {
+        double sum = 0;
+        for (auto i : p.second) {
+          sum += calls_[i].extra_metrics[metric.first].as<PercentNode>()->percent;
+        }
+        row[metric.first] = ObjectRef(make_object<PercentNode>(sum));
+      } else if (metric.second.as<StringObj>()) {
+        // assume all rows contain the same value for this metric
+        row[metric.first] = Downcast<String>(metric.second);
+      }
+    }
+
+    rows.push_back(row);
+  }
+
+  // sort rows by duration
+  if (sort) {
+    std::sort(rows.begin(), rows.end(),
+              [&](const std::unordered_map<std::string, ObjectRef>& a,
+                  const std::unordered_map<std::string, ObjectRef>& b) {
+                return a.at("Duration (us)").as<DurationNode>()->microseconds >
+                       b.at("Duration (us)").as<DurationNode>()->microseconds;
+              });
+  }
+
+  double op_sum = 0;
+  int64_t total_count = 0;
+  double per = 0;
+  for (auto row : rows) {
+    op_sum += row["Duration (us)"].as<DurationNode>()->microseconds;
+    total_count += row["Count"].as<CountNode>()->value;
+    per += row["Percent"].as<PercentNode>()->percent;
+  }
+
+  rows.push_back({{"Name", String("------------------")}});
+  rows.push_back({{"Name", String("Total")},
+                  {"Duration (us)", ObjectRef(make_object<DurationNode>(op_sum))},
+                  {"Count", ObjectRef(make_object<CountNode>(total_count))},
+                  {"Percent", ObjectRef(make_object<PercentNode>(per))}});
+
+  std::stringstream s;
+  s.imbue(std::locale(""));
+  s << FormatTable(rows);
+  s << std::fixed << std::setprecision(2);
+  for (auto p : global_times) {
+    s << "Total time " << DeviceName(p.first.device_type) << p.first.device_id << ": " << p.second
+      << "us" << std::endl;
+  }
+  s << "Overhead: " << overall_time - op_sum << "us "
+    << (overall_time - op_sum) / overall_time * 100 << "% (Time not spent in operators)";
+
+  return s.str();
+}
+
+TVM_REGISTER_OBJECT_TYPE(DurationNode);
+TVM_REGISTER_OBJECT_TYPE(PercentNode);
+TVM_REGISTER_OBJECT_TYPE(CountNode);
+}  // namespace profiling
 }  // namespace runtime
 }  // namespace tvm
time " << DeviceName(p.first.device_type) << p.first.device_id << ": " << p.second + << "us" << std::endl; + } + s << "Overhead: " << overall_time - op_sum << "us " + << (overall_time - op_sum) / overall_time * 100 << "% (Time not spent in operators)"; + + return s.str(); +} + +TVM_REGISTER_OBJECT_TYPE(DurationNode); +TVM_REGISTER_OBJECT_TYPE(PercentNode); +TVM_REGISTER_OBJECT_TYPE(CountNode); +} // namespace profiling } // namespace runtime } // namespace tvm diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc index 3f3dee437cb35..2c921b7af0030 100644 --- a/src/runtime/vm/profiler/vm.cc +++ b/src/runtime/vm/profiler/vm.cc @@ -41,60 +41,26 @@ namespace vm { PackedFunc VirtualMachineDebug::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { - if (name == "get_stat") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - ICHECK_EQ(args.size(), 1U); - std::vector> op_acc_time; - std::unordered_map> op_durations; - for (auto kv : op_timers_) { - std::vector durations_us; - for (auto t : kv.second) { - durations_us.push_back(t->SyncAndGetElapsedNanos() / 1e3); + if (name == "profile") { + return TypedPackedFunc([sptr_to_self, this](String arg_name) { + std::vector devices; + for (auto dev : devices_) { + if (dev.device_type > 0) { + devices.push_back(dev); } - op_durations[kv.first] = durations_us; } - for (auto kv : op_durations) { - auto val = - std::make_pair(kv.first, std::accumulate(kv.second.begin(), kv.second.end(), 0.0)); - op_acc_time.push_back(val); - } - bool sort_by_time = args[0]; - if (sort_by_time) { - auto comp = [](const std::pair& lhs, const std::pair& rhs) { - return lhs.second > rhs.second; - }; - std::sort(op_acc_time.begin(), op_acc_time.end(), comp); - } - double total_duration = 0.0; - int64_t total_packed_funcs = 0; - std::ostringstream os; - os << std::setw(30) << std::left << "#OpName" - << "\t" << std::setw(10) << std::left << "#InvokeCount" - << "\t" - << "#Duration(us): Sum/Mean/Min/Max" << std::endl; - - for (auto kv : op_acc_time) { - auto vals = op_durations[kv.first]; - auto sum = kv.second; - auto mean = sum / static_cast(vals.size()); - auto min_value = *std::min_element(vals.begin(), vals.end()); - auto max_value = *std::max_element(vals.begin(), vals.end()); - os << std::setw(30) << std::left << packed_index_map_[kv.first] << "\t" << std::setw(10) - << std::left << op_invokes_[kv.first] << "\t" << sum << "/" << mean << "/" << min_value - << "/" << max_value << std::endl; - - total_duration += sum; - total_packed_funcs += op_invokes_[kv.first]; + auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self); + // warmup + for (int i = 0; i < 3; i++) { + invoke(arg_name); } - os << "\nTotal Duration: " << total_duration << " us.\t" - << "Total Packed Functions: " << total_packed_funcs << std::endl; - *rv = os.str(); - }); - } else if (name == "reset") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - op_timers_.clear(); - op_invokes_.clear(); + + prof_ = profiling::Profiler(); // reset profiler + prof_.Start(devices); + invoke(arg_name); + prof_.Stop(); + return prof_.Report(); }); } else { return VirtualMachine::GetFunction(name, sptr_to_self); @@ -106,7 +72,6 @@ void VirtualMachineDebug::LoadExecutable(const Executable* exec) { ICHECK(exec_); for (auto kv : exec_->primitive_map) { packed_index_map_[kv.second] = kv.first; - op_invokes_[kv.second] = 0; } } @@ -114,23 +79,38 @@ void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& fun Index 
diff --git a/src/runtime/vm/profiler/vm.h b/src/runtime/vm/profiler/vm.h
index 9f5ce87bcf472..521a9bd454e7d 100644
--- a/src/runtime/vm/profiler/vm.h
+++ b/src/runtime/vm/profiler/vm.h
@@ -52,8 +52,7 @@ class VirtualMachineDebug : public VirtualMachine {
                     const std::vector<ObjectRef>& args) final;

   std::unordered_map<Index, std::string> packed_index_map_;
-  std::unordered_map<Index, std::vector<Timer>> op_timers_;
-  std::unordered_map<Index, int> op_invokes_;
+  profiling::Profiler prof_;
 };

 }  // namespace vm
diff --git a/tests/cpp/profiling.cc b/tests/cpp/profiling_test.cc
similarity index 100%
rename from tests/cpp/profiling.cc
rename to tests/cpp/profiling_test.cc
diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py
new file mode 100644
index 0000000000000..c4d855a3c7d3b
--- /dev/null
+++ b/tests/python/unittest/test_runtime_profiling.py
@@ -0,0 +1,51 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import numpy as np
+import pytest
+
+import tvm.testing
+from tvm.runtime import profiler_vm
+from tvm import relay
+from tvm.relay.testing import mlp
+from tvm.contrib.debugger import debug_executor
+
+
+@pytest.mark.skipif(not profiler_vm.enabled(), reason="VM Profiler not enabled")
+@tvm.testing.parametrize_targets
+def test_vm(target, dev):
+    mod, params = mlp.get_workload(1)
+
+    exe = relay.vm.compile(mod, target, params=params)
+    vm = profiler_vm.VirtualMachineProfiler(exe, dev)
+
+    data = np.random.rand(1, 1, 28, 28).astype("float32")
+    report = vm.profile([data], func_name="main")
+    assert "fused_nn_softmax" in report
+    assert "Total time" in report
+
+
+@tvm.testing.parametrize_targets
+def test_graph_executor(target, dev):
+    mod, params = mlp.get_workload(1)
+
+    exe = relay.build(mod, target, params=params)
+    gr = debug_executor.create(exe.get_json(), exe.lib, dev)
+
+    data = np.random.rand(1, 1, 28, 28).astype("float32")
+    report = gr.profile(data=data)
+    assert "fused_nn_softmax" in report
+    assert "Total time" in report
diff --git a/tests/python/unittest/test_runtime_vm_profiler.py b/tests/python/unittest/test_runtime_vm_profiler.py
index c1c94211a664c..7eb867a85bd02 100644
--- a/tests/python/unittest/test_runtime_vm_profiler.py
+++ b/tests/python/unittest/test_runtime_vm_profiler.py
@@ -16,25 +16,25 @@
 # under the License.
 import numpy as np

+import tvm
 from tvm.runtime import profiler_vm
 from tvm import relay
-from tvm.relay.testing import resnet, enabled_targets
+from tvm.relay.testing import mlp


-def test_basic():
-    mod, params = resnet.get_workload()
+@tvm.testing.parametrize_targets
+def test_basic(dev, target):
+    mod, params = mlp.get_workload(batch_size=1)
     if not profiler_vm.enabled():
         return

-    for target, dev in enabled_targets():
-        exe = relay.vm.compile(mod, target, params=params)
-        vm = profiler_vm.VirtualMachineProfiler(exe, dev)
+    exe = relay.vm.compile(mod, target, params=params)
+    vm = profiler_vm.VirtualMachineProfiler(exe, dev)

-        data = np.random.rand(1, 3, 224, 224).astype("float32")
-        res = vm.invoke("main", [data])
-        print("\n{}".format(vm.get_stat()))
-        print("\n{}".format(vm.get_stat(False)))
+    data = np.random.rand(1, 1, 28, 28).astype("float32")
+    res = vm.profile(tvm.nd.array(data), func_name="main")
+    assert "softmax" in res


 if __name__ == "__main__":
-    test_basic()
+    test_basic(tvm.cpu(), tvm.target.Target("llvm"))
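
Both entry points return the report as a plain string laid out by `FormatTable`/`Report` above: a header row with Name, Duration (us), Percent, and Count, one row per op, a Total row, then one "Total time" line per device and an "Overhead" line. A sketch of a structural check, reusing `vm` and `data` from the VM example earlier:

```python
# Sanity-check the report layout produced by Profiler::Report.
report = vm.profile(tvm.nd.array(data), func_name="main")
for marker in ("Duration (us)", "Percent", "Total time", "Overhead"):
    assert marker in report
```
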