[Profiling,VM] Profiling interface for VM and Graph Runtime (apache#7624)

* [Profiling,VM] Profiling interface for VM and Graph Runtime

* lint

* fix test

* make profiling test actually run

* Try to better match the graph runtime function names to vm

* formatting

* DurationNode.value -> microseconds; PercentNode.value -> percent; make frame sorting optional.

* renaming for the tvmcontext -> device change

* formatting

* remove old vm profiler get_stat api

* fix tests
tkonolige authored and Trevor Morris committed May 6, 2021
1 parent a7c1df3 commit 2ac53b1
Showing 11 changed files with 612 additions and 91 deletions.
141 changes: 141 additions & 0 deletions include/tvm/runtime/profiling.h
@@ -27,9 +27,14 @@
#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/device_api.h>
#include <tvm/runtime/object.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

#include <stack>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace tvm {
namespace runtime {
@@ -144,6 +149,142 @@ class Timer : public ObjectRef {
*/
Timer DefaultTimer(Device dev);

namespace profiling {

/*! Information about a single function or operator call. */
struct CallFrame {
/*! Device on which the call was made */
Device dev;
/*! Name of the function or op */
String name;
/*! Runtime of the function or op */
Timer timer;
/*! Extra performance metrics */
std::unordered_map<std::string, ObjectRef> extra_metrics;
};

/*! Runtime profiler for function and/or operator calls. Used in the graph
* runtime and VM to provide profiling information for all operators.
*
* Example usage:
* \code{.cpp}
* Profiler prof;
* Device cpu, gpu;
* prof.Start({cpu, gpu});
* prof.StartCall("my_gpu_kernel", gpu);
* my_gpu_kernel();
* prof.StopCall();
* prof.StartCall("my_cpu_function", cpu);
* my_cpu_function();
* prof.StopCall();
* prof.Stop();
 * std::cout << prof.Report() << std::endl;  // print profiling report
* \endcode
*/
class Profiler {
public:
/*! \brief Start the profiler.
* \param devs The list of devices the profiler will be running on. Should
* include all devices used by profiled operators.
*
* This function should only be called once per object.
*/
void Start(const std::vector<Device>& devs);
/*! \brief Stop the profiler.
*
* This function should only be called once per object after start has been called.
*/
void Stop();
/*! \brief Start a function call.
* \param name The name of the function being called.
* \param dev The device on which the function is running.
* \param extra_metrics Optional additional profiling information to add to
* the frame (input sizes, allocations).
*
* `StartCall` may be nested, but each `StartCall` needs a matching
* `StopCall`. Function calls are stopped in LIFO order, so calls to
* `StartCall` and `StopCall` must be nested properly.
*/
void StartCall(String name, Device dev,
std::unordered_map<std::string, ObjectRef> extra_metrics = {});
/*! \brief Stop the last `StartCall`.
* \param extra_metrics Optional additional profiling information to add to
* the frame (input sizes, allocations).
*/
void StopCall(std::unordered_map<std::string, ObjectRef> extra_metrics = {});
/*! \brief A textual report of total runtime between `Start` and `Stop` as
* well as individual statistics for each `StartCall`-`StopCall` pair.
* \param aggregate Whether or not to join multiple calls to the same op into a single line.
* \param sort Whether or not to sort call frames by descending duration. If
* false and if `aggregate` is false, frames will be sorted by order of
* appearance in the program. Order is undefined if `sort` is false and
* `aggregate` is true.
* \returns The report as a string.
*/
String Report(bool aggregate = true, bool sort = true);
/*! \brief Check if the profiler is currently running.
* \returns Whether or not the profiler is running.
*/
bool IsRunning() const { return !global_timers_.empty(); }

private:
std::vector<std::pair<Device, Timer>> global_timers_;
std::vector<CallFrame> calls_;
std::stack<CallFrame> in_flight_;
};

/* \brief A duration in time. */
class DurationNode : public Object {
public:
/* The duration as a floating point number of microseconds. */
double microseconds;

/* \brief Construct a new duration.
* \param a The duration in microseconds.
*/
explicit DurationNode(double a) : microseconds(a) {}

static constexpr const char* _type_key = "runtime.profiling.Duration";
TVM_DECLARE_FINAL_OBJECT_INFO(DurationNode, Object);
};

/* A percentage of something */
class PercentNode : public Object {
public:
/* The percent as a floating point value out of 100%. i.e. if `percent` is 10 then we have 10%. */
double percent;

/* \brief Construct a new percentage.
* \param a The percentage out of 100.
*/
explicit PercentNode(double a) : percent(a) {}

static constexpr const char* _type_key = "runtime.profiling.Percent";
TVM_DECLARE_FINAL_OBJECT_INFO(PercentNode, Object);
};

/* A count of something */
class CountNode : public Object {
public:
/* The actual count */
int64_t value;

/* \brief Construct a new count.
* \param a The count.
*/
explicit CountNode(int64_t a) : value(a) {}

static constexpr const char* _type_key = "runtime.profiling.Count";
TVM_DECLARE_FINAL_OBJECT_INFO(CountNode, Object);
};

/*! \brief String representation of an array of NDArray shapes
* \param shapes Array of NDArrays to get the shapes of.
* \return A textual representation of the shapes. For example: `float32[2], int64[1, 2]`.
*/
String ShapeString(const std::vector<NDArray>& shapes);

} // namespace profiling
} // namespace runtime
} // namespace tvm

19 changes: 19 additions & 0 deletions python/tvm/contrib/debugger/debug_executor.py
@@ -100,6 +100,7 @@ def __init__(self, module, device, graph_json_str, dump_root):
self._dump_path = None
self._get_output_by_layer = module["get_output_by_layer"]
self._run_individual = module["run_individual"]
self._profile = module["profile"]
graph_executor.GraphModule.__init__(self, module)
self._create_debug_env(graph_json_str, device)

@@ -234,6 +235,24 @@ def run_individual(self, number, repeat=1, min_repeat_ms=0):
ret = self._run_individual(number, repeat, min_repeat_ms)
return ret.strip(",").split(",") if ret else []

def profile(self, **input_dict):
"""Run forward execution of the graph and collect overall and per-op
performance metrics.
Parameters
----------
input_dict : dict of str to NDArray
Input values to be fed to the graph, keyed by input name.
Returns
-------
timing_results : str
Per-operator and whole graph timing results in a table format.
"""
if input_dict:
self.set_input(**input_dict)

return self._profile()

def exit(self):
"""Exits the dump folder and all its contents"""
self._remove_dump_root()
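
For context, a short sketch of how the new profile method can be driven end to end. The graph_json and lib objects and the input name "data" are placeholder assumptions standing in for an earlier compilation step, not part of this change:

    import numpy as np
    import tvm
    from tvm.contrib.debugger import debug_executor

    # Hedged sketch: graph_json and lib are assumed to come from building a
    # model beforehand; "data" is a placeholder input name.
    dev = tvm.cpu()
    m = debug_executor.create(graph_json, lib, dev)
    report = m.profile(data=np.random.rand(1, 3, 224, 224).astype("float32"))
    print(report)  # per-operator and whole-graph timings as a table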
36 changes: 24 additions & 12 deletions python/tvm/runtime/profiler_vm.py
@@ -20,6 +20,7 @@
Provides extra APIs for profiling vm execution.
"""
import warnings
from tvm.runtime import _ffi_api
from . import vm

@@ -37,26 +38,37 @@ def __init__(self, exe, device, memory_cfg=None):
self.module = _ffi_api._VirtualMachineDebug(exe.module)
self._init = self.module["init"]
self._invoke = self.module["invoke"]
self._get_stat = self.module["get_stat"]
self._profile = self.module["profile"]
self._set_input = self.module["set_input"]
self._reset = self.module["reset"]
self._setup_device(device, memory_cfg)

def get_stat(self, sort_by_time=True):
def get_stat(self, sort_by_time=True): # pylint: disable=unused-argument
"""Get the statistics of executed ops.
REMOVED, use profile method instead.
"""
warnings.warn("get_stat has been removed, use profile instead")
return ""

def profile(self, *args, func_name="main", **kwargs):
"""Profile a function call.
Parameters
----------
sort_by_time: Optional[Boolean]
Set to indicate the returned results are sorted by execution time in
the descending order. It is printed in the random order if this
field is not set.
func_name : str
The name of the function.
args : list[tvm.runtime.NDArray] or list[np.ndarray]
The arguments to the function.
kwargs: dict of str to tvm.runtime.NDArray or np.ndarray
Named arguments to the function.
Returns
-------
The execution statistics in string.
timing_results : str
Overall and per-op timing results formatted in a table.
"""
return self._get_stat(sort_by_time)

def reset(self):
self._reset()
if args or kwargs:
self.set_input(func_name, *args, **kwargs)
return self._profile(func_name)
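
A comparable sketch for the VM path, mirroring the pattern the new profiling tests use; the MLP workload, the llvm target, and the "main" function name are illustrative assumptions:

    import numpy as np
    import tvm
    from tvm import relay
    from tvm.relay import testing
    from tvm.runtime import profiler_vm

    # Hedged sketch: any Relay module works; a small MLP keeps it short.
    mod, params = testing.mlp.get_workload(batch_size=1)
    exe = relay.vm.compile(mod, target="llvm", params=params)
    vm = profiler_vm.VirtualMachineProfiler(exe, tvm.cpu())
    data = np.random.rand(1, 1, 28, 28).astype("float32")
    print(vm.profile(data, func_name="main"))  # overall and per-op timing table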
46 changes: 46 additions & 0 deletions src/runtime/graph_executor/debug/graph_executor_debug.cc
@@ -228,6 +228,50 @@ class GraphExecutorDebug : public GraphExecutor {

data_entry_[eid].CopyTo(data_out);
}

/*!
* \brief Profile execution time of the module.
*
* We run the entire module while recording overall and per-op timing
* information. The module may be run multiple times to ensure everything is
 * warmed up. This function reflects the actual runtime of the module more
 * accurately than GraphExecutorDebug::RunIndividual, because it runs the
 * entire graph in order.
*
* \returns A table of per-op runtimes and total times.
*/
String Profile() {
// warm up. 1 iteration does not seem enough.
for (int i = 0; i < 3; i++) {
GraphExecutor::Run();
}

profiling::Profiler prof;
prof.Start(devices_);
for (size_t i = 0; i < op_execs_.size(); ++i) {
if (op_execs_[i]) {
// get argument shapes
std::vector<NDArray> shapes;
for (const auto& e : nodes_[i].inputs) {
uint32_t eid = entry_id(e);
shapes.push_back(data_entry_[eid]);
}
for (uint32_t j = 0; j < nodes_[i].param.num_outputs; ++j) {
uint32_t eid = entry_id(i, j);
shapes.push_back(data_entry_[eid]);
}

uint32_t eid = entry_id(i, 0);
const Device& device = data_entry_[eid]->device;
prof.StartCall(nodes_[i].param.func_name, device,
{{"Argument Shapes", profiling::ShapeString(shapes)}});
op_execs_[i]();
prof.StopCall();
}
}
prof.Stop();
return prof.Report();
}
};

/*!
@@ -260,6 +304,8 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name,
ICHECK_GE(min_repeat_ms, 0);
*rv = this->RunIndividual(number, repeat, min_repeat_ms);
});
} else if (name == "profile") {
return TypedPackedFunc<String()>([sptr_to_self, this]() { return this->Profile(); });
} else {
return GraphExecutor::GetFunction(name, sptr_to_self);
}
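
The "profile" packed function registered here is what the Python wrappers bind to; assuming m is the debug executor module from the earlier sketch, it can also be fetched by hand:

    # Hedged sketch: calling the registered PackedFunc directly returns the
    # same report string as the Python profile() wrapper.
    report = m.module["profile"]()
    print(report)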
2 changes: 2 additions & 0 deletions src/runtime/graph_executor/graph_executor.cc
@@ -26,6 +26,7 @@
#include <tvm/runtime/device_api.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/profiling.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/serializer.h>

@@ -59,6 +60,7 @@ void GraphExecutor::Run() {
if (op_execs_[i]) op_execs_[i]();
}
}

/*!
* \brief Initialize the graph executor with graph and device.
* \param graph_json The execution graph.