Skip to content

Commit

Permalink
[Runtime] Flush L2 cache in time eval (#15305)
Browse files Browse the repository at this point in the history
This PR introduces optional cache flush functionality to
`time_evaluator`. It is implemented by allocating two large empty
NDArrays on the device and copying one into the other before each
repeat, so that the L2 cache is flushed. This gives us a more accurate
evaluation of the performance of a runtime function.
  • Loading branch information
spectrometerHBH authored Jul 18, 2023
1 parent e2d6511 commit c0946e1
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 22 deletions.
4 changes: 3 additions & 1 deletion include/tvm/runtime/profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -579,13 +579,15 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
* defined by `repeats_to_cooldown`.
* \param repeats_to_cooldown The number of repeats before the
* cooldown is activated.
* \param cache_flush_bytes The number of bytes to flush from cache before each repeat.
* \param f_preproc The function to be executed before we execute time
* evaluator.
* \return f_timer A timer function.
*/
PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
int limit_zero_time_iterations, int cooldown_interval_ms,
int repeats_to_cooldown, PackedFunc f_preproc = nullptr);
int repeats_to_cooldown, int cache_flush_bytes = 0,
PackedFunc f_preproc = nullptr);

} // namespace profiling
} // namespace runtime
Expand Down
5 changes: 5 additions & 0 deletions python/tvm/runtime/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ def time_evaluator(
limit_zero_time_iterations=100,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
cache_flush_bytes=0,
f_preproc="",
):
"""Get an evaluator that measures time cost of running function.
Expand Down Expand Up @@ -358,6 +359,9 @@ def time_evaluator(
repeats_to_cooldown: int, optional
The number of repeats before the cooldown is activated.
cache_flush_bytes: int, optional
The number of bytes to flush from the cache before each repeat.
f_preproc: str, optional
The preprocess function name we want to execute before executing the time evaluator.
Expand All @@ -384,6 +388,7 @@ def time_evaluator(
limit_zero_time_iterations,
cooldown_interval_ms,
repeats_to_cooldown,
cache_flush_bytes,
f_preproc,
)

Expand Down
5 changes: 3 additions & 2 deletions src/runtime/crt/common/crt_runtime_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -489,14 +489,15 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re
int* ret_type_code) {
ret_val[0].v_handle = NULL;
ret_type_code[0] = kTVMNullptr;
if (num_args < 11) {
if (num_args < 12) {
TVMAPIErrorf("not enough args");
return kTvmErrorFunctionCallNumArguments;
}
if (type_codes[0] != kTVMModuleHandle || type_codes[1] != kTVMStr ||
type_codes[2] != kTVMArgInt || type_codes[3] != kTVMArgInt || type_codes[4] != kTVMArgInt ||
type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMArgInt ||
type_codes[8] != kTVMArgInt || type_codes[9] != kTVMArgInt || type_codes[10] != kTVMStr) {
type_codes[8] != kTVMArgInt || type_codes[9] != kTVMArgInt || type_codes[10] != kTVMArgInt ||
type_codes[11] != kTVMStr) {
TVMAPIErrorf("one or more invalid arg types");
return kTvmErrorFunctionCallWrongArgType;
}
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/graph_executor/debug/graph_executor_debug.cc
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ std::vector<double> GraphExecutorDebug::RunOpRPC(int index, int number, int repe
->
operator()(module_, name, static_cast<int>(dev.device_type), dev.device_id, number,
repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms,
repeats_to_cooldown, "");
repeats_to_cooldown, /*cache_flush_bytes=*/0, "");

int num_flat_args = num_inputs + num_outputs;
auto values = std::make_unique<TVMValue[]>(num_flat_args);
Expand Down
16 changes: 13 additions & 3 deletions src/runtime/profiling.cc
Original file line number Diff line number Diff line change
Expand Up @@ -861,7 +861,7 @@ TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction")

PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, int min_repeat_ms,
int limit_zero_time_iterations, int cooldown_interval_ms,
int repeats_to_cooldown, PackedFunc f_preproc) {
int repeats_to_cooldown, int cache_flush_bytes, PackedFunc f_preproc) {
ICHECK(pf != nullptr);

if (static_cast<int>(dev.device_type) == static_cast<int>(kDLMicroDev)) {
Expand All @@ -871,13 +871,20 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat,
}

auto ftimer = [pf, dev, number, repeat, min_repeat_ms, limit_zero_time_iterations,
cooldown_interval_ms, repeats_to_cooldown,
cooldown_interval_ms, repeats_to_cooldown, cache_flush_bytes,
f_preproc](TVMArgs args, TVMRetValue* rv) mutable {
TVMRetValue temp;
std::ostringstream os;
// skip first time call, to activate lazy compilation components.
pf.CallPacked(args, &temp);

// allocate two large arrays to flush L2 cache
NDArray arr1, arr2;
if (cache_flush_bytes > 0) {
arr1 = NDArray::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev);
arr2 = NDArray::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev);
}

DeviceAPI::Get(dev)->StreamSync(dev, nullptr);

for (int i = 0; i < repeat; ++i) {
Expand All @@ -892,7 +899,10 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat,
number = static_cast<int>(
std::max((min_repeat_ms / (duration_ms / number) + 1), number * golden_ratio));
}

if (cache_flush_bytes > 0) {
arr1.CopyFrom(arr2);
}
DeviceAPI::Get(dev)->StreamSync(dev, nullptr);
// start timing
Timer t = Timer::Start(dev);
for (int j = 0; j < number; ++j) {
Expand Down
28 changes: 14 additions & 14 deletions src/runtime/rpc/rpc_module.cc
Original file line number Diff line number Diff line change
Expand Up @@ -198,23 +198,23 @@ class RPCModuleNode final : public ModuleNode {
PackedFunc GetTimeEvaluator(const std::string& name, Device dev, int number, int repeat,
int min_repeat_ms, int limit_zero_time_iterations,
int cooldown_interval_ms, int repeats_to_cooldown,
const std::string& f_preproc_name) {
int cache_flush_bytes, const std::string& f_preproc_name) {
InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator");
// Remove session mask because we pass dev by parts.
ICHECK_EQ(GetRPCSessionIndex(dev), sess_->table_index())
<< "ValueError: Need to pass the matched remote device to RPCModule.GetTimeEvaluator";
dev = RemoveRPCSessionMask(dev);

if (module_handle_ != nullptr) {
return remote_get_time_evaluator_(GetRef<Module>(this), name,
static_cast<int>(dev.device_type), dev.device_id, number,
repeat, min_repeat_ms, limit_zero_time_iterations,
cooldown_interval_ms, repeats_to_cooldown, f_preproc_name);
return remote_get_time_evaluator_(
GetRef<Module>(this), name, static_cast<int>(dev.device_type), dev.device_id, number,
repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms,
repeats_to_cooldown, cache_flush_bytes, f_preproc_name);
} else {
return remote_get_time_evaluator_(Optional<Module>(nullptr), name,
static_cast<int>(dev.device_type), dev.device_id, number,
repeat, min_repeat_ms, limit_zero_time_iterations,
cooldown_interval_ms, repeats_to_cooldown, f_preproc_name);
return remote_get_time_evaluator_(
Optional<Module>(nullptr), name, static_cast<int>(dev.device_type), dev.device_id, number,
repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms,
repeats_to_cooldown, cache_flush_bytes, f_preproc_name);
}
}

Expand Down Expand Up @@ -253,7 +253,7 @@ class RPCModuleNode final : public ModuleNode {
std::shared_ptr<RPCSession> sess_;
// remote function to get time evaluator
TypedPackedFunc<PackedFunc(Optional<Module>, std::string, int, int, int, int, int, int, int, int,
std::string)>
int, std::string)>
remote_get_time_evaluator_;
// remote function getter for modules.
TypedPackedFunc<PackedFunc(Module, std::string, bool)> remote_mod_get_function_;
Expand Down Expand Up @@ -372,7 +372,7 @@ inline void CPUCacheFlush(int begin_index, const TVMArgs& args) {
TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
.set_body_typed([](Optional<Module> opt_mod, std::string name, int device_type, int device_id,
int number, int repeat, int min_repeat_ms, int limit_zero_time_iterations,
int cooldown_interval_ms, int repeats_to_cooldown,
int cooldown_interval_ms, int repeats_to_cooldown, int cache_flush_bytes,
std::string f_preproc_name) {
Device dev;
dev.device_type = static_cast<DLDeviceType>(device_type);
Expand All @@ -384,7 +384,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
return static_cast<RPCModuleNode*>(m.operator->())
->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms,
limit_zero_time_iterations, cooldown_interval_ms,
repeats_to_cooldown, f_preproc_name);
repeats_to_cooldown, cache_flush_bytes, f_preproc_name);
} else {
PackedFunc f_preproc;
if (!f_preproc_name.empty()) {
Expand All @@ -397,7 +397,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
CHECK(pf != nullptr) << "Cannot find " << name << " in the global registry";
return profiling::WrapTimeEvaluator(pf, dev, number, repeat, min_repeat_ms,
limit_zero_time_iterations, cooldown_interval_ms,
repeats_to_cooldown, f_preproc);
repeats_to_cooldown, cache_flush_bytes, f_preproc);
}
} else {
auto* pf = runtime::Registry::Get(name);
Expand All @@ -411,7 +411,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
}
return profiling::WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms,
limit_zero_time_iterations, cooldown_interval_ms,
repeats_to_cooldown, f_preproc);
repeats_to_cooldown, cache_flush_bytes, f_preproc);
}
});

Expand Down
2 changes: 1 addition & 1 deletion web/emcc/tvmjs_support.cc
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ class AsyncLocalSession : public LocalSession {
CHECK(time_exec != nullptr) << "Cannot find wasm.GetTimer in the global function";
(*time_exec)(TypedPackedFunc<void(int)>(finvoke), dev, number, repeat, min_repeat_ms,
limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown,
on_complete);
/*cache_flush_bytes=*/0, on_complete);
};
return PackedFunc(ftimer);
}
Expand Down

0 comments on commit c0946e1

Please sign in to comment.