[Profiling,VM] Profiling interface for VM and Graph Runtime (apache#7624)

* [Profiling,VM] Profiling interface for VM and Graph Runtime

* lint

* fix test

* make profiling test actually run

* Try to better match the graph runtime function names to vm

* formatting

* DurationNode.value -> microseconds; PercentNode.value -> percent; make frame sorting optional.

* renaming for the tvmcontext -> device change

* formatting

* remove old vm profiler get_stat api

* fix tests
tkonolige authored and Trevor Morris committed May 6, 2021
1 parent a7c1df3 commit 2ac53b1
Showing 11 changed files with 612 additions and 91 deletions.
141 changes: 141 additions & 0 deletions include/tvm/runtime/profiling.h
@@ -27,9 +27,14 @@
#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/device_api.h>
#include <tvm/runtime/object.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

#include <stack>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace tvm {
namespace runtime {
@@ -144,6 +149,142 @@ class Timer : public ObjectRef {
*/
Timer DefaultTimer(Device dev);

namespace profiling {

/*! Information about a single function or operator call. */
struct CallFrame {
/*! Device on which the call was made */
Device dev;
/*! Name of the function or op */
String name;
/*! Runtime of the function or op */
Timer timer;
/*! Extra performance metrics */
std::unordered_map<std::string, ObjectRef> extra_metrics;
};

/*! Runtime profiler for function and/or operator calls. Used in the graph
* runtime and VM to provide profiling information for all operators.
*
* Example usage:
* \code{.cpp}
* Profiler prof;
* Device cpu, gpu;
* prof.Start({cpu, gpu});
* prof.StartCall("my_gpu_kernel", gpu);
* my_gpu_kernel();
* prof.StopCall();
* prof.StartCall("my_cpu_function", cpu);
* my_cpu_function();
* prof.StopCall();
* prof.Stop();
 * std::cout << prof.Report() << std::endl;  // print profiling report
* \endcode
*/
class Profiler {
public:
/*! \brief Start the profiler.
* \param devs The list of devices the profiler will be running on. Should
* include all devices used by profiled operators.
*
* This function should only be called once per object.
*/
void Start(const std::vector<Device>& devs);
/*! \brief Stop the profiler.
*
* This function should only be called once per object after start has been called.
*/
void Stop();
/*! \brief Start a function call.
* \param name The name of the function being called.
* \param dev The device on which the function is running.
* \param extra_metrics Optional additional profiling information to add to
* the frame (input sizes, allocations).
*
* `StartCall` may be nested, but each `StartCall` needs a matching
* `StopCall`. Function calls are stopped in LIFO order, so calls to
* `StartCall` and `StopCall` must be nested properly.
*/
void StartCall(String name, Device dev,
std::unordered_map<std::string, ObjectRef> extra_metrics = {});
/*! \brief Stop the last `StartCall`.
* \param extra_metrics Optional additional profiling information to add to
* the frame (input sizes, allocations).
*/
void StopCall(std::unordered_map<std::string, ObjectRef> extra_metrics = {});
/*! \brief A textual report of total runtime between `Start` and `Stop` as
* well as individual statistics for each `StartCall`-`StopCall` pair.
* \param aggregate Whether or not to join multiple calls to the same op into a single line.
* \param sort Whether or not to sort call frames by descending duration. If
* false and if `aggregate` is false, frames will be sorted by order of
* appearance in the program. Order is undefined if `sort` is false and
* `aggregate` is true.
* \returns The report as a string.
*/
String Report(bool aggregate = true, bool sort = true);
/*! \brief Check if the profiler is currently running.
* \returns Whether or not the profiler is running.
*/
bool IsRunning() const { return !global_timers_.empty(); }

private:
std::vector<std::pair<Device, Timer>> global_timers_;
std::vector<CallFrame> calls_;
std::stack<CallFrame> in_flight_;
};

/* \brief A duration in time. */
class DurationNode : public Object {
public:
/* The duration as a floating point number of microseconds. */
double microseconds;

/* \brief Construct a new duration.
* \param a The duration in microseconds.
*/
explicit DurationNode(double a) : microseconds(a) {}

static constexpr const char* _type_key = "runtime.profiling.Duration";
TVM_DECLARE_FINAL_OBJECT_INFO(DurationNode, Object);
};

/* A percentage of something */
class PercentNode : public Object {
public:
/* The percent as a floating point value out of 100%. i.e. if `percent` is 10 then we have 10%. */
double percent;

/* \brief Construct a new percentage.
* \param a The percentage out of 100.
*/
explicit PercentNode(double a) : percent(a) {}

static constexpr const char* _type_key = "runtime.profiling.Percent";
TVM_DECLARE_FINAL_OBJECT_INFO(PercentNode, Object);
};

/* A count of something */
class CountNode : public Object {
public:
/* The actual count */
int64_t value;

/* \brief Construct a new count.
* \param a The count.
*/
explicit CountNode(int64_t a) : value(a) {}

static constexpr const char* _type_key = "runtime.profiling.Count";
TVM_DECLARE_FINAL_OBJECT_INFO(CountNode, Object);
};

/*! \brief String representation of an array of NDArray shapes
* \param shapes Array of NDArrays to get the shapes of.
* \return A textual representation of the shapes. For example: `float32[2], int64[1, 2]`.
*/
String ShapeString(const std::vector<NDArray>& shapes);

} // namespace profiling
} // namespace runtime
} // namespace tvm

19 changes: 19 additions & 0 deletions python/tvm/contrib/debugger/debug_executor.py
@@ -100,6 +100,7 @@ def __init__(self, module, device, graph_json_str, dump_root):
self._dump_path = None
self._get_output_by_layer = module["get_output_by_layer"]
self._run_individual = module["run_individual"]
self._profile = module["profile"]
graph_executor.GraphModule.__init__(self, module)
self._create_debug_env(graph_json_str, device)

@@ -234,6 +235,24 @@ def run_individual(self, number, repeat=1, min_repeat_ms=0):
ret = self._run_individual(number, repeat, min_repeat_ms)
return ret.strip(",").split(",") if ret else []

def profile(self, **input_dict):
"""Run forward execution of the graph and collect overall and per-op
performance metrics.
Parameters
----------
input_dict : dict of str to NDArray
Input values to be fed to the graph, keyed by input name.
Returns
-------
timing_results : str
Per-operator and whole graph timing results in a table format.
"""
if input_dict:
self.set_input(**input_dict)

return self._profile()

def exit(self):
"""Exits the dump folder and all its contents"""
self._remove_dump_root()
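
For context, a short sketch of how the new profile method can be driven end to end. The graph_json and lib objects and the input name "data" are placeholder assumptions standing in for an earlier compilation step, not part of this change:

    import numpy as np
    import tvm
    from tvm.contrib.debugger import debug_executor

    # Hedged sketch: graph_json and lib are assumed to come from building a
    # model beforehand; "data" is a placeholder input name.
    dev = tvm.cpu()
    m = debug_executor.create(graph_json, lib, dev)
    report = m.profile(data=np.random.rand(1, 3, 224, 224).astype("float32"))
    print(report)  # per-operator and whole-graph timings as a table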
36 changes: 24 additions & 12 deletions python/tvm/runtime/profiler_vm.py
@@ -20,6 +20,7 @@
Provides extra APIs for profiling vm execution.
"""
import warnings
from tvm.runtime import _ffi_api
from . import vm

@@ -37,26 +38,37 @@ def __init__(self, exe, device, memory_cfg=None):
self.module = _ffi_api._VirtualMachineDebug(exe.module)
self._init = self.module["init"]
self._invoke = self.module["invoke"]
self._get_stat = self.module["get_stat"]
self._profile = self.module["profile"]
self._set_input = self.module["set_input"]
self._reset = self.module["reset"]
self._setup_device(device, memory_cfg)

def get_stat(self, sort_by_time=True):
def get_stat(self, sort_by_time=True): # pylint: disable=unused-argument
"""Get the statistics of executed ops.
REMOVED, use profile method instead.
"""
warnings.warn("get_stat has been removed, use profile instead")
return ""

def profile(self, *args, func_name="main", **kwargs):
"""Profile a function call.
Parameters
----------
sort_by_time: Optional[Boolean]
Set to indicate the returned results are sorted by execution time in
the descending order. It is printed in the random order if this
field is not set.
func_name : str
The name of the function.
args : list[tvm.runtime.NDArray] or list[np.ndarray]
The arguments to the function.
kwargs: dict of str to tvm.runtime.NDArray or np.ndarray
Named arguments to the function.
Returns
-------
The execution statistics in string.
timing_results : str
Overall and per-op timing results formatted in a table.
"""
return self._get_stat(sort_by_time)

def reset(self):
self._reset()
if args or kwargs:
self.set_input(func_name, *args, **kwargs)
return self._profile(func_name)
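
A comparable sketch for the VM path, mirroring the pattern the new profiling tests use; the MLP workload, the llvm target, and the "main" function name are illustrative assumptions:

    import numpy as np
    import tvm
    from tvm import relay
    from tvm.relay import testing
    from tvm.runtime import profiler_vm

    # Hedged sketch: any Relay module works; a small MLP keeps it short.
    mod, params = testing.mlp.get_workload(batch_size=1)
    exe = relay.vm.compile(mod, target="llvm", params=params)
    vm = profiler_vm.VirtualMachineProfiler(exe, tvm.cpu())
    data = np.random.rand(1, 1, 28, 28).astype("float32")
    print(vm.profile(data, func_name="main"))  # overall and per-op timing table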
46 changes: 46 additions & 0 deletions src/runtime/graph_executor/debug/graph_executor_debug.cc
@@ -228,6 +228,50 @@ class GraphExecutorDebug : public GraphExecutor {

data_entry_[eid].CopyTo(data_out);
}

/*!
* \brief Profile execution time of the module.
*
* We run the entire module while recording overall and per-op timing
* information. The module may be run multiple times to ensure everything is
 * warmed up. This function reflects the actual runtime of the module more
 * accurately than GraphExecutorDebug::RunIndividual, because it runs the
 * entire graph in order.
*
* \returns A table of per-op runtimes and total times.
*/
String Profile() {
// warm up. 1 iteration does not seem enough.
for (int i = 0; i < 3; i++) {
GraphExecutor::Run();
}

profiling::Profiler prof;
prof.Start(devices_);
for (size_t i = 0; i < op_execs_.size(); ++i) {
if (op_execs_[i]) {
// get argument shapes
std::vector<NDArray> shapes;
for (const auto& e : nodes_[i].inputs) {
uint32_t eid = entry_id(e);
shapes.push_back(data_entry_[eid]);
}
for (uint32_t j = 0; j < nodes_[i].param.num_outputs; ++j) {
uint32_t eid = entry_id(i, j);
shapes.push_back(data_entry_[eid]);
}

uint32_t eid = entry_id(i, 0);
const Device& device = data_entry_[eid]->device;
prof.StartCall(nodes_[i].param.func_name, device,
{{"Argument Shapes", profiling::ShapeString(shapes)}});
op_execs_[i]();
prof.StopCall();
}
}
prof.Stop();
return prof.Report();
}
};

/*!
@@ -260,6 +304,8 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name,
ICHECK_GE(min_repeat_ms, 0);
*rv = this->RunIndividual(number, repeat, min_repeat_ms);
});
} else if (name == "profile") {
return TypedPackedFunc<String()>([sptr_to_self, this]() { return this->Profile(); });
} else {
return GraphExecutor::GetFunction(name, sptr_to_self);
}
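
The "profile" packed function registered here is what the Python wrappers bind to; assuming m is the debug executor module from the earlier sketch, it can also be fetched by hand:

    # Hedged sketch: calling the registered PackedFunc directly returns the
    # same report string as the Python profile() wrapper.
    report = m.module["profile"]()
    print(report)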
2 changes: 2 additions & 0 deletions src/runtime/graph_executor/graph_executor.cc
@@ -26,6 +26,7 @@
#include <tvm/runtime/device_api.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/profiling.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/serializer.h>

@@ -59,6 +60,7 @@ void GraphExecutor::Run() {
if (op_execs_[i]) op_execs_[i]();
}
}

/*!
* \brief Initialize the graph executor with graph and device.
* \param graph_json The execution graph.