[PROFILING] Add ability to profile a single function
Add a new function `tvm.runtime.profiling.profile_function` that
collects performance metrics for a single function in an IRModule, for
example performance counters gathered with `PAPIMetricCollector`. This
is helpful when optimizing kernels and schedules for a single operator.
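
A minimal end-to-end sketch of the new API, put together from the docstring and the test added in this commit; it assumes TVM is built with PAPI support and that the PAPI_FP_OPS counter is available on the host CPU:

import numpy as np
import tvm
from tvm.script import tir as T

# Small TIR kernel to profile (same as the one used in the new test).
@T.prim_func
def axpy_cpu(a: T.handle, b: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, [10], "float64")
    B = T.match_buffer(b, [10], "float64")
    C = T.match_buffer(c, [10], "float64")
    for i in range(10):
        C[i] = A[i] + B[i]

f = tvm.build(axpy_cpu, target="llvm")
dev = tvm.cpu()
a = tvm.nd.array(np.ones(10), device=dev)
b = tvm.nd.array(np.ones(10), device=dev)
c = tvm.nd.array(np.zeros(10), device=dev)

# profile_function returns a PackedFunc; calling it runs the function
# (after a warmup) and returns a dict of collected metrics keyed by name.
prof = tvm.runtime.profiling.profile_function(
    f, dev, [tvm.runtime.profiling.PAPIMetricCollector({dev: ["PAPI_FP_OPS"]})]
)
metrics = prof(a, b, c)
print(metrics["PAPI_FP_OPS"])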
Tristan Konolige committed Nov 23, 2021
1 parent 3f2ae34 commit 6f217ec
Showing 4 changed files with 171 additions and 0 deletions.
31 changes: 31 additions & 0 deletions include/tvm/runtime/profiling.h
@@ -477,6 +477,37 @@ String ShapeString(NDArray shape, DLDataType dtype);
*/
String ShapeString(const std::vector<int64_t>& shape, DLDataType dtype);

/*! \brief Collect performance information of a function execution. Usually
* used with a compiled PrimFunc (via tvm.build).
*
* This information can include performance counters like cache hits and FLOPs
* that are useful in debugging performance issues of individual PrimFuncs.
* Different metrics can be collected depending on which MetricCollector is
* used.
*
* Example usage:
* \code{.cpp}
* // Use PAPI to measure the number of floating point operations.
* PackedFunc profiler = ProfileFunction(
* mod, "main", kDLCPU, 0, {CreatePAPIMetricCollector({{kDLCPU, 0}, {"PAPI_FP_OPS"}})});
* Map<String, ObjectRef> r = profiler(arg1, arg2, arg3);
* std::cout << r << std::endl;
* \endcode
*
* \param mod Module to profile. Usually a PrimFunc that has been compiled to machine code.
* \param func_name Name of function to run in the module.
* \param device_type Device type to run on. Profiling will include performance
* metrics specific to this device type.
* \param device_id Id of device to run on.
* \param collectors List of different ways to collect metrics. See MetricCollector.
* \returns A PackedFunc which takes the same arguments as `mod[func_name]` and
* returns performance metrics as a `Map<String, ObjectRef>` where values can be
* `CountNode`, `DurationNode`, or `PercentNode`.
*/
PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, int device_id,
Array<MetricCollector> collectors);

} // namespace profiling
} // namespace runtime
} // namespace tvm
43 changes: 43 additions & 0 deletions python/tvm/runtime/profiling/__init__.py
@@ -163,6 +163,49 @@ def __init__(self, dev: Device):
self.__init_handle_by_constructor__(_ffi_api.DeviceWrapper, dev)


def profile_function(mod, dev, collectors, func_name="main"):
"""Collect performance information of a function execution. Usually used with
a compiled PrimFunc.
This information can include performance counters like cache hits and FLOPs
that are useful in debugging performance issues of individual PrimFuncs.
Different metrics can be collected depending on which MetricCollector is
used.
Example
-------
.. code-block:: python
f = tvm.build(my_func, target="llvm", name="my_func")
prof = tvm.runtime.profiling.profile_function(
f,
tvm.cpu(),
[tvm.runtime.profiling.PAPIMetricCollector({tvm.cpu(): ["PAPI_FP_OPS"]})],
)
counters = prof(*args)
print(counters)
Parameters
----------
mod: Module
Module containing the function to profile.
dev: Device
Device to run the function on.
collectors: List[MetricCollector]
:py:class:`MetricCollector`s which will collect performance information.
func_name: str
Name of the function in `mod` to profile. Defaults to "main".
Returns
-------
prof: PackedFunc[args, Dict[str, ObjectRef]]
PackedFunc which takes the same arguments as `mod[func_name]` and
returns performance metrics as a `Dict[str, ObjectRef]` where values
can be `CountNode`, `DurationNode`, `PercentNode`.
"""
return _ffi_api.ProfileFunction(mod, func_name, dev.device_type, dev.device_id, collectors)


# We only enable this class when TVM is built with PAPI support
if _ffi.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is not None:

50 changes: 50 additions & 0 deletions src/runtime/profiling.cc
@@ -677,6 +677,56 @@ TVM_REGISTER_GLOBAL("runtime.profiling.FromJSON").set_body_typed(Report::FromJSON);
TVM_REGISTER_GLOBAL("runtime.profiling.DeviceWrapper").set_body_typed([](Device dev) {
return DeviceWrapper(dev);
});

PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, int device_id,
Array<MetricCollector> collectors) {
// Module::GetFunction is not const, so this lambda has to be mutable
return PackedFunc([=](TVMArgs args, TVMRetValue* ret) mutable {
PackedFunc f = mod.GetFunction(func_name);
Device dev{static_cast<DLDeviceType>(device_type), device_id};

// warmup
for (size_t i = 0; i < 10; i++) {
f.CallPacked(args, ret);
}

for (auto& collector : collectors) {
collector->Init({DeviceWrapper(dev)});
}
std::vector<ObjectRef> collector_data;
for (auto& collector : collectors) {
collector_data.push_back(collector->Start(dev));
}
// TODO(tkonolige): repeated calls if the runtime is small?
f.CallPacked(args, ret);
std::unordered_map<String, ObjectRef> results;
for (size_t i = 0; i < collectors.size(); i++) {
auto r = collectors[i]->Stop(collector_data[i]);
// We might want to do this in a separate loop to avoid unnecessary time
// spent before stopping subsequent collectors.
for (auto kv : r) {
results[kv.first] = kv.second;
}
}
*ret = Map<String, ObjectRef>(results);
});
}

TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction")
.set_body_typed<PackedFunc(Module, String, int, int,
Array<MetricCollector>)>([](Module mod, String func_name,
int device_type, int device_id,
Array<MetricCollector> collectors) {
if (mod->type_key() == std::string("rpc")) {
LOG(FATAL)
<< "Profiling a module over RPC is not yet supported"; // because we can't send
// MetricCollectors over rpc.
throw;
} else {
return ProfileFunction(mod, func_name, device_type, device_id, collectors);
}
});

} // namespace profiling
} // namespace runtime
} // namespace tvm
47 changes: 47 additions & 0 deletions tests/python/unittest/test_runtime_profiling.py
@@ -29,6 +29,7 @@
from tvm import rpc
from tvm.contrib import utils
from tvm.runtime.profiling import Report
from tvm.script import tir as T


def read_csv(report):
@@ -195,6 +196,52 @@ def test_report_serialization():
)


@T.prim_func
def axpy_cpu(a: T.handle, b: T.handle, c: T.handle) -> None:
A = T.match_buffer(a, [10], "float64")
B = T.match_buffer(b, [10], "float64")
C = T.match_buffer(c, [10], "float64")
for i in range(10):
C[i] = A[i] + B[i]


@T.prim_func
def axpy_gpu(a: T.handle, b: T.handle, c: T.handle) -> None:
A = T.match_buffer(a, [10], "float64")
B = T.match_buffer(b, [10], "float64")
C = T.match_buffer(c, [10], "float64")
for i in T.thread_binding(0, 10, "threadIdx.x"):
C[i] = A[i] + B[i]


@tvm.testing.parametrize_targets("cuda", "llvm")
@pytest.mark.skipif(
tvm.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is None,
reason="PAPI profiling not enabled",
)
def test_profile_function(target, dev):
target = tvm.target.Target(target)
if str(target.kind) == "llvm":
metric = "PAPI_FP_OPS"
func = axpy_cpu
elif str(target.kind) == "cuda":
metric = (
"cuda:::gpu__compute_memory_access_throughput.max.pct_of_peak_sustained_region:device=0"
)
func = axpy_gpu
else:
pytest.skip(f"Target {target.kind} not supported by this test")
f = tvm.build(func, target=target)
a = tvm.nd.array(np.ones(10), device=dev)
b = tvm.nd.array(np.ones(10), device=dev)
c = tvm.nd.array(np.zeros(10), device=dev)
report = tvm.runtime.profiling.profile_function(
f, dev, [tvm.runtime.profiling.PAPIMetricCollector({dev: [metric]})]
)(a, b, c)
assert metric in report.keys()
assert report[metric].value > 0


if __name__ == "__main__":
import sys
import pytest
