[AutoScheduler] Allow device specification for AutoScheduler Runners. (apache#10123)

* Changed the python api to support device.

* Finished implementation and updated tests.

* Fix typo.
Josh Fromm authored and ylc committed Feb 16, 2022
1 parent 629d336 commit 5fb57a1
Showing 4 changed files with 42 additions and 14 deletions.
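The new device argument is exposed on both Python runner constructors. A minimal usage sketch (hypothetical setup: the tracker key, host/port, and the index 1 are assumptions, not part of this commit):

from tvm import auto_scheduler

# Run local measurements on the second device of the target type (index 1).
local_runner = auto_scheduler.LocalRunner(timeout=10, device=1)

# Or pick the device index on a machine reached through an RPC tracker.
rpc_runner = auto_scheduler.RPCRunner(
    key="v100",        # hypothetical tracker key
    host="127.0.0.1",  # hypothetical tracker address
    port=9190,
    timeout=10,
    device=1,
)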
8 changes: 6 additions & 2 deletions include/tvm/auto_scheduler/measure.h
@@ -308,6 +308,8 @@ class ProgramRunnerNode : public Object {
double cooldown_interval;
/*! \brief Whether to flush cache on CPU between repeated measurements. */
bool enable_cpu_cache_flush;
/*! \brief Which device to run on if multiple are available. */
int device;

/*!
* \brief Run measurement and return results.
@@ -391,9 +393,10 @@ class LocalRunner : public ProgramRunner {
* \param min_repeat_ms The minimum duration of one repeat in milliseconds.
* \param cooldown_interval The cool down interval between two measurements.
* \param enable_cpu_cache_flush Whether to flush cache on CPU between repeated measurements.
* \param device Which device to run on if multiple are available.
*/
LocalRunner(int timeout, int number, int repeat, int min_repeat_ms, double cooldown_interval,
bool enable_cpu_cache_flush);
bool enable_cpu_cache_flush, int device);

TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(LocalRunner, ProgramRunner, LocalRunnerNode);
};
@@ -443,10 +446,11 @@ class RPCRunner : public ProgramRunner {
* \param min_repeat_ms The minimum duration of one repeat in milliseconds.
* \param cooldown_interval The cool down interval between two measurements.
* \param enable_cpu_cache_flush Whether to flush cache on CPU between repeated measurements.
* \param device Which device to run on if multiple are available.
*/
RPCRunner(const String& key, const String& host, int port, int priority, int n_parallel,
int timeout, int number, int repeat, int min_repeat_ms, double cooldown_interval,
bool enable_cpu_cache_flush);
bool enable_cpu_cache_flush, int device);

TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(RPCRunner, ProgramRunner, RPCRunnerNode);
};
28 changes: 25 additions & 3 deletions python/tvm/auto_scheduler/measure.py
@@ -382,6 +382,8 @@ class LocalRunner(ProgramRunner):
its actual latency during end-to-end inference.
To make this option effective, the argument `number` should also be set to 1.
This only has an effect on CPU tasks.
device: int = 0
Which device to run on if multiple are available.
"""

def __init__(
@@ -392,6 +394,7 @@ def __init__(
min_repeat_ms=100,
cooldown_interval=0.0,
enable_cpu_cache_flush=False,
device=0,
):
if enable_cpu_cache_flush:
number = 1
@@ -405,6 +408,7 @@ def __init__(
min_repeat_ms,
cooldown_interval,
enable_cpu_cache_flush,
device,
)


@@ -453,6 +457,8 @@ class RPCRunner(ProgramRunner):
its actual latency during end-to-end inference.
To make this option effective, the argument `number` should also be set to 1.
This only has an effect on CPU tasks.
device: int = 0
Which device to run on if multiple are available.
"""

def __init__(
@@ -468,6 +474,7 @@ def __init__(
min_repeat_ms=100,
cooldown_interval=0.0,
enable_cpu_cache_flush=False,
device=0,
):
self.__init_handle_by_constructor__(
_ffi_api.RPCRunner,
@@ -482,6 +489,7 @@ def __init__(
min_repeat_ms,
cooldown_interval,
enable_cpu_cache_flush,
device,
)

if check_remote(key, host, port, priority, timeout):
@@ -532,6 +540,8 @@ class LocalRPCMeasureContext:
its actual latency during end-to-end inference.
To make this option effective, the argument `number` should also be set to 1.
This only has an effect on CPU tasks.
device: int = 0
Which device to run on if multiple are available.
"""

def __init__(
Expand All @@ -544,6 +554,7 @@ def __init__(
min_repeat_ms=0,
cooldown_interval=0.0,
enable_cpu_cache_flush=False,
device=0,
):
# pylint: disable=import-outside-toplevel
from tvm.rpc.tracker import Tracker
@@ -570,6 +581,7 @@ def __init__(
min_repeat_ms,
cooldown_interval,
enable_cpu_cache_flush,
device,
)
# Wait for the processes to start
time.sleep(0.5)
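The same index flows through LocalRPCMeasureContext into the RPCRunner it creates, so it can be used in the usual single-machine tuning flow. A sketch, assuming a CUDA target and at least two GPUs (the index 1 and the log-file name are assumptions):

from tvm import auto_scheduler

measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300, device=1)
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=200,
    runner=measure_ctx.runner,  # the RPCRunner built by the context
    measure_callbacks=[auto_scheduler.RecordToFile("tuning.json")],
)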
@@ -871,14 +883,15 @@ def _timed_eval_func(
cooldown_interval,
enable_cpu_cache_flush,
verbose,
device,
):
inp = MeasureInput.deserialize(inp_serialized)
tic = time.time()
error_no = 0
error_msg = None
try:
func = module.load_module(build_res.filename)
dev = ndarray.device(str(inp.task.target), 0)
dev = ndarray.device(str(inp.task.target), device)
# Limitation:
# We can not get PackFunction directly in the remote mode as it is wrapped
# under the std::function. We could lift the restriction later once we fold
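The ndarray.device(...) call above resolves the target string to a device type and uses the new device argument as the device id, so for a CUDA target device=1 is equivalent to tvm.device("cuda", 1). A small illustration (values are only examples):

import tvm

dev0 = tvm.device("cuda", 0)  # first GPU, the previous hard-coded behavior
dev1 = tvm.device("cuda", 1)  # what device=1 now selects
print(dev0.device_type == dev1.device_type, dev0.device_id, dev1.device_id)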
@@ -947,6 +960,7 @@ def local_run(
cooldown_interval=0,
enable_cpu_cache_flush=False,
verbose=1,
device=0,
):
"""
Run function of LocalRunner to test the performance of the input BuildResults.
@@ -986,6 +1000,8 @@ def local_run(
This only has an effect on CPU tasks.
verbose: int = 1
Verbosity level. 0 for silent, 1 to output information during program measuring.
device: int = 0
Which device to run on if multiple are available.
Returns
-------
@@ -1021,6 +1037,7 @@ def local_run(
cooldown_interval,
enable_cpu_cache_flush,
verbose,
device,
),
)
if isinstance(res, TimeoutError):
@@ -1067,6 +1084,7 @@ def _rpc_run(
cooldown_interval,
enable_cpu_cache_flush,
verbose,
device,
):
inp = MeasureInput.deserialize(inp_serialized)
tic = time.time()
@@ -1077,7 +1095,7 @@
remote = request_remote(key, host, port, priority, timeout)
remote.upload(build_res.filename)
func = remote.load_module(os.path.split(build_res.filename)[1])
dev = remote.device(str(inp.task.target), 0)
dev = remote.device(str(inp.task.target), device)
# Limitation:
# We can not get PackFunction directly in the remote mode as it is wrapped
# under the std::function. We could lift the restriction later once we fold
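The RPC path performs the same resolution on the remote session via remote.device(...). A sketch, assuming a running tracker with a registered device (key, address, and index are placeholders):

from tvm.auto_scheduler.utils import request_remote

remote = request_remote("v100", "127.0.0.1", 9190, priority=1, timeout=10)
dev = remote.device("cuda", 1)  # second GPU on the remote machine
print(dev.exist, dev.device_id)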
@@ -1166,7 +1184,7 @@ def _rpc_run_worker(args):
res : MeasureResult
The measure result of this Runner thread.
"""
_, build_res, _, _, _, _, _, timeout, _, _, _, _, _, verbose = args
_, build_res, _, _, _, _, _, timeout, _, _, _, _, _, verbose, _ = args
if build_res.error_no != MeasureErrorNo.NO_ERROR:
return (
(MAX_FLOAT,),
@@ -1209,6 +1227,7 @@ def rpc_runner_run(
cooldown_interval=0.0,
enable_cpu_cache_flush=False,
verbose=1,
device=0,
):
"""Run function of RPCRunner to test the performance of the input BuildResults.
@@ -1257,6 +1276,8 @@ def rpc_runner_run(
This only has an effect on CPU tasks.
verbose: int = 1
Verbosity level. 0 for silent, 1 to output information during program measuring.
device: int = 0
Which device to run on if multiple are available.
Returns
-------
@@ -1284,6 +1305,7 @@ def rpc_runner_run(
cooldown_interval,
enable_cpu_cache_flush,
verbose,
device,
)
for inp, build_res in zip(inputs, build_results)
],
18 changes: 10 additions & 8 deletions src/auto_scheduler/measure.cc
@@ -127,14 +127,15 @@ Array<BuildResult> LocalBuilderNode::Build(const Array<MeasureInput>& inputs, in

/********** LocalRunner **********/
LocalRunner::LocalRunner(int timeout, int number, int repeat, int min_repeat_ms,
double cooldown_interval, bool enable_cpu_cache_flush) {
double cooldown_interval, bool enable_cpu_cache_flush, int device) {
ObjectPtr<LocalRunnerNode> node = make_object<LocalRunnerNode>();
node->timeout = timeout;
node->number = number;
node->repeat = repeat;
node->min_repeat_ms = min_repeat_ms;
node->cooldown_interval = cooldown_interval;
node->enable_cpu_cache_flush = enable_cpu_cache_flush;
node->device = device;
data_ = std::move(node);
}

@@ -143,7 +144,7 @@ Array<MeasureResult> LocalRunnerNode::Run(const Array<MeasureInput>& inputs,
if (const auto* f = runtime::Registry::Get("auto_scheduler.local_runner.run")) {
Array<MeasureResult> results =
(*f)(inputs, build_results, timeout, number, repeat, min_repeat_ms, cooldown_interval,
enable_cpu_cache_flush, verbose);
enable_cpu_cache_flush, verbose, device);
return results;
}
LOG(FATAL) << "auto_scheduler.local_runner.run is not registered. "
@@ -155,7 +156,7 @@ Array<MeasureResult> LocalRunnerNode::Run(const Array<MeasureInput>& inputs,
/********** RPCRunner **********/
RPCRunner::RPCRunner(const String& key, const String& host, int port, int priority, int n_parallel,
int timeout, int number, int repeat, int min_repeat_ms,
double cooldown_interval, bool enable_cpu_cache_flush) {
double cooldown_interval, bool enable_cpu_cache_flush, int device) {
auto node = make_object<RPCRunnerNode>();
node->key = key;
node->host = host;
@@ -168,6 +169,7 @@ RPCRunner::RPCRunner(const String& key, const String& host, int port, int priori
node->min_repeat_ms = min_repeat_ms;
node->cooldown_interval = cooldown_interval;
node->enable_cpu_cache_flush = enable_cpu_cache_flush;
node->device = device;
data_ = std::move(node);
}

@@ -176,7 +178,7 @@ Array<MeasureResult> RPCRunnerNode::Run(const Array<MeasureInput>& inputs,
if (const auto* f = runtime::Registry::Get("auto_scheduler.rpc_runner.run")) {
Array<MeasureResult> results =
(*f)(inputs, build_results, key, host, port, priority, n_parallel, timeout, number, repeat,
min_repeat_ms, cooldown_interval, enable_cpu_cache_flush, verbose);
min_repeat_ms, cooldown_interval, enable_cpu_cache_flush, verbose, device);
return results;
} else {
LOG(FATAL) << "auto_scheduler.rpc_runner.run is not registered. "
@@ -409,17 +411,17 @@ TVM_REGISTER_GLOBAL("auto_scheduler.LocalBuilder")

TVM_REGISTER_GLOBAL("auto_scheduler.LocalRunner")
.set_body_typed([](int timeout, int number, int repeat, int min_repeat_ms,
double cooldown_interval, bool enable_cpu_cache_flush) {
double cooldown_interval, bool enable_cpu_cache_flush, int device) {
return LocalRunner(timeout, number, repeat, min_repeat_ms, cooldown_interval,
enable_cpu_cache_flush);
enable_cpu_cache_flush, device);
});

TVM_REGISTER_GLOBAL("auto_scheduler.RPCRunner")
.set_body_typed([](const String& key, const String& host, int port, int priority,
int n_parallel, int timeout, int number, int repeat, int min_repeat_ms,
double cooldown_interval, bool enable_cpu_cache_flush) {
double cooldown_interval, bool enable_cpu_cache_flush, int device) {
return RPCRunner(key, host, port, priority, n_parallel, timeout, number, repeat,
min_repeat_ms, cooldown_interval, enable_cpu_cache_flush);
min_repeat_ms, cooldown_interval, enable_cpu_cache_flush, device);
});

} // namespace auto_scheduler
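These registered globals are what the Python constructors call through the FFI; the trailing int is the new device argument. A sketch of calling the global directly (normally unnecessary, since auto_scheduler.LocalRunner wraps it):

import tvm

create_local_runner = tvm.get_global_func("auto_scheduler.LocalRunner")
# (timeout, number, repeat, min_repeat_ms, cooldown_interval,
#  enable_cpu_cache_flush, device)
runner = create_local_runner(10, 3, 1, 100, 0.0, False, 1)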
2 changes: 1 addition & 1 deletion tests/python/relay/test_auto_scheduler_tuning.py
@@ -36,7 +36,7 @@ def tune_network(network, target):
log_file = fp.name

# Tuning
measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60)
measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60, device=0)
tuner = auto_scheduler.TaskScheduler(tasks, task_weights, callbacks=[])
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=100,
