From 5fb57a150745c1dea06bc420dc26d8050aeea38e Mon Sep 17 00:00:00 2001
From: Josh Fromm
Date: Tue, 1 Feb 2022 15:09:27 -0800
Subject: [PATCH] [AutoScheduler] Allow device specification for AutoScheduler Runners. (#10123)

* Changed the python api to support device.

* Finished implementation and updated tests.

* Fix typo.
---
 include/tvm/auto_scheduler/measure.h        |  8 ++++--
 python/tvm/auto_scheduler/measure.py        | 28 +++++++++++++++++--
 src/auto_scheduler/measure.cc               | 18 ++++++------
 .../relay/test_auto_scheduler_tuning.py     |  2 +-
 4 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/include/tvm/auto_scheduler/measure.h b/include/tvm/auto_scheduler/measure.h
index 20a93e280b55..8576468816cb 100755
--- a/include/tvm/auto_scheduler/measure.h
+++ b/include/tvm/auto_scheduler/measure.h
@@ -308,6 +308,8 @@ class ProgramRunnerNode : public Object {
   double cooldown_interval;
   /*! \brief Whether to flush cache on CPU between repeated measurements. */
   bool enable_cpu_cache_flush;
+  /*! \brief Which device to run on if multiple are available. */
+  int device;
 
   /*!
    * \brief Run measurement and return results.
@@ -391,9 +393,10 @@ class LocalRunner : public ProgramRunner {
    * \param min_repeat_ms The minimum duration of one repeat in milliseconds.
    * \param cooldown_interval The cool down interval between two measurements.
    * \param enable_cpu_cache_flush Whether to flush cache on CPU between repeated measurements.
+   * \param device Which device to run on if multiple are available.
    */
   LocalRunner(int timeout, int number, int repeat, int min_repeat_ms, double cooldown_interval,
-              bool enable_cpu_cache_flush);
+              bool enable_cpu_cache_flush, int device);
 
   TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(LocalRunner, ProgramRunner, LocalRunnerNode);
 };
@@ -443,10 +446,11 @@ class RPCRunner : public ProgramRunner {
    * \param min_repeat_ms The minimum duration of one repeat in milliseconds.
    * \param cooldown_interval The cool down interval between two measurements.
    * \param enable_cpu_cache_flush Whether to flush cache on CPU between repeated measurements.
+   * \param device Which device to run on if multiple are available.
    */
   RPCRunner(const String& key, const String& host, int port, int priority, int n_parallel,
             int timeout, int number, int repeat, int min_repeat_ms, double cooldown_interval,
-            bool enable_cpu_cache_flush);
+            bool enable_cpu_cache_flush, int device);
 
   TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(RPCRunner, ProgramRunner, RPCRunnerNode);
 };
diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py
index dd2328732b9a..4148cdbd3c94 100644
--- a/python/tvm/auto_scheduler/measure.py
+++ b/python/tvm/auto_scheduler/measure.py
@@ -382,6 +382,8 @@ class LocalRunner(ProgramRunner):
         its actual latency during end-to-end inference.
         To make this option effective, the argument `number` should also be set to 1.
         This is only has effect on CPU task.
+    device: int = 0
+        Which device to run on if multiple are available.
     """
 
     def __init__(
@@ -392,6 +394,7 @@ def __init__(
         min_repeat_ms=100,
         cooldown_interval=0.0,
         enable_cpu_cache_flush=False,
+        device=0,
     ):
         if enable_cpu_cache_flush:
             number = 1
@@ -405,6 +408,7 @@ def __init__(
             min_repeat_ms,
             cooldown_interval,
             enable_cpu_cache_flush,
+            device,
         )
 
 
@@ -453,6 +457,8 @@ class RPCRunner(ProgramRunner):
         its actual latency during end-to-end inference.
         To make this option effective, the argument `number` should also be set to 1.
         This is only has effect on CPU task.
+    device: int = 0
+        Which device to run on if multiple are available.
""" def __init__( @@ -468,6 +474,7 @@ def __init__( min_repeat_ms=100, cooldown_interval=0.0, enable_cpu_cache_flush=False, + device=0, ): self.__init_handle_by_constructor__( _ffi_api.RPCRunner, @@ -482,6 +489,7 @@ def __init__( min_repeat_ms, cooldown_interval, enable_cpu_cache_flush, + device, ) if check_remote(key, host, port, priority, timeout): @@ -532,6 +540,8 @@ class LocalRPCMeasureContext: its actual latency during end-to-end inference. To make this option effective, the argument `number` should also be set to 1. This is only has effect on CPU task. + device: int = 0 + Which device to run on if multiple are available. """ def __init__( @@ -544,6 +554,7 @@ def __init__( min_repeat_ms=0, cooldown_interval=0.0, enable_cpu_cache_flush=False, + device=0, ): # pylint: disable=import-outside-toplevel from tvm.rpc.tracker import Tracker @@ -570,6 +581,7 @@ def __init__( min_repeat_ms, cooldown_interval, enable_cpu_cache_flush, + device, ) # Wait for the processes to start time.sleep(0.5) @@ -871,6 +883,7 @@ def _timed_eval_func( cooldown_interval, enable_cpu_cache_flush, verbose, + device, ): inp = MeasureInput.deserialize(inp_serialized) tic = time.time() @@ -878,7 +891,7 @@ def _timed_eval_func( error_msg = None try: func = module.load_module(build_res.filename) - dev = ndarray.device(str(inp.task.target), 0) + dev = ndarray.device(str(inp.task.target), device) # Limitation: # We can not get PackFunction directly in the remote mode as it is wrapped # under the std::function. We could lift the restriction later once we fold @@ -947,6 +960,7 @@ def local_run( cooldown_interval=0, enable_cpu_cache_flush=False, verbose=1, + device=0, ): """ Run function of LocalRunner to test the performance of the input BuildResults. @@ -986,6 +1000,8 @@ def local_run( This is only has effect on CPU task. verbose: int = 1 Verbosity level. 0 for silent, 1 to output information during program measuring. + device: int = 0 + Which device to run on if multiple are available. Returns ------- @@ -1021,6 +1037,7 @@ def local_run( cooldown_interval, enable_cpu_cache_flush, verbose, + device, ), ) if isinstance(res, TimeoutError): @@ -1067,6 +1084,7 @@ def _rpc_run( cooldown_interval, enable_cpu_cache_flush, verbose, + device, ): inp = MeasureInput.deserialize(inp_serialized) tic = time.time() @@ -1077,7 +1095,7 @@ def _rpc_run( remote = request_remote(key, host, port, priority, timeout) remote.upload(build_res.filename) func = remote.load_module(os.path.split(build_res.filename)[1]) - dev = remote.device(str(inp.task.target), 0) + dev = remote.device(str(inp.task.target), device) # Limitation: # We can not get PackFunction directly in the remote mode as it is wrapped # under the std::function. We could lift the restriction later once we fold @@ -1166,7 +1184,7 @@ def _rpc_run_worker(args): res : MeasureResult The measure result of this Runner thread. """ - _, build_res, _, _, _, _, _, timeout, _, _, _, _, _, verbose = args + _, build_res, _, _, _, _, _, timeout, _, _, _, _, _, verbose, _ = args if build_res.error_no != MeasureErrorNo.NO_ERROR: return ( (MAX_FLOAT,), @@ -1209,6 +1227,7 @@ def rpc_runner_run( cooldown_interval=0.0, enable_cpu_cache_flush=False, verbose=1, + device=0, ): """Run function of RPCRunner to test the performance of the input BuildResults. @@ -1257,6 +1276,8 @@ def rpc_runner_run( This is only has effect on CPU task. verbose: int = 1 Verbosity level. 0 for silent, 1 to output information during program measuring. + device: int = 0 + Which device to run on if multiple are available. 
 
     Returns
     -------
@@ -1284,6 +1305,7 @@ def rpc_runner_run(
                 cooldown_interval,
                 enable_cpu_cache_flush,
                 verbose,
+                device,
             )
             for inp, build_res in zip(inputs, build_results)
         ],
diff --git a/src/auto_scheduler/measure.cc b/src/auto_scheduler/measure.cc
index c3212f2b4478..abb77581e7ee 100755
--- a/src/auto_scheduler/measure.cc
+++ b/src/auto_scheduler/measure.cc
@@ -127,7 +127,7 @@ Array<BuildResult> LocalBuilderNode::Build(const Array<MeasureInput>& inputs, in
 
 /********** LocalRunner **********/
 LocalRunner::LocalRunner(int timeout, int number, int repeat, int min_repeat_ms,
-                         double cooldown_interval, bool enable_cpu_cache_flush) {
+                         double cooldown_interval, bool enable_cpu_cache_flush, int device) {
   ObjectPtr<LocalRunnerNode> node = make_object<LocalRunnerNode>();
   node->timeout = timeout;
   node->number = number;
@@ -135,6 +135,7 @@ LocalRunner::LocalRunner(int timeout, int number, int repeat, int min_repeat_ms,
   node->min_repeat_ms = min_repeat_ms;
   node->cooldown_interval = cooldown_interval;
   node->enable_cpu_cache_flush = enable_cpu_cache_flush;
+  node->device = device;
   data_ = std::move(node);
 }
 
@@ -143,7 +144,7 @@ Array<MeasureResult> LocalRunnerNode::Run(const Array<MeasureInput>& inputs,
   if (const auto* f = runtime::Registry::Get("auto_scheduler.local_runner.run")) {
     Array<MeasureResult> results =
         (*f)(inputs, build_results, timeout, number, repeat, min_repeat_ms, cooldown_interval,
-             enable_cpu_cache_flush, verbose);
+             enable_cpu_cache_flush, verbose, device);
     return results;
   }
   LOG(FATAL) << "auto_scheduler.local_runner.run is not registered. "
@@ -155,7 +156,7 @@ Array<MeasureResult> LocalRunnerNode::Run(const Array<MeasureInput>& inputs,
 /********** RPCRunner **********/
 RPCRunner::RPCRunner(const String& key, const String& host, int port, int priority, int n_parallel,
                      int timeout, int number, int repeat, int min_repeat_ms,
-                     double cooldown_interval, bool enable_cpu_cache_flush) {
+                     double cooldown_interval, bool enable_cpu_cache_flush, int device) {
   auto node = make_object<RPCRunnerNode>();
   node->key = key;
   node->host = host;
@@ -168,6 +169,7 @@ RPCRunner::RPCRunner(const String& key, const String& host, int port, int priori
   node->min_repeat_ms = min_repeat_ms;
   node->cooldown_interval = cooldown_interval;
   node->enable_cpu_cache_flush = enable_cpu_cache_flush;
+  node->device = device;
   data_ = std::move(node);
 }
 
@@ -176,7 +178,7 @@ Array<MeasureResult> RPCRunnerNode::Run(const Array<MeasureInput>& inputs,
   if (const auto* f = runtime::Registry::Get("auto_scheduler.rpc_runner.run")) {
     Array<MeasureResult> results =
         (*f)(inputs, build_results, key, host, port, priority, n_parallel, timeout, number, repeat,
-             min_repeat_ms, cooldown_interval, enable_cpu_cache_flush, verbose);
+             min_repeat_ms, cooldown_interval, enable_cpu_cache_flush, verbose, device);
     return results;
   } else {
     LOG(FATAL) << "auto_scheduler.rpc_runner.run is not registered. "
" @@ -409,17 +411,17 @@ TVM_REGISTER_GLOBAL("auto_scheduler.LocalBuilder") TVM_REGISTER_GLOBAL("auto_scheduler.LocalRunner") .set_body_typed([](int timeout, int number, int repeat, int min_repeat_ms, - double cooldown_interval, bool enable_cpu_cache_flush) { + double cooldown_interval, bool enable_cpu_cache_flush, int device) { return LocalRunner(timeout, number, repeat, min_repeat_ms, cooldown_interval, - enable_cpu_cache_flush); + enable_cpu_cache_flush, device); }); TVM_REGISTER_GLOBAL("auto_scheduler.RPCRunner") .set_body_typed([](const String& key, const String& host, int port, int priority, int n_parallel, int timeout, int number, int repeat, int min_repeat_ms, - double cooldown_interval, bool enable_cpu_cache_flush) { + double cooldown_interval, bool enable_cpu_cache_flush, int device) { return RPCRunner(key, host, port, priority, n_parallel, timeout, number, repeat, - min_repeat_ms, cooldown_interval, enable_cpu_cache_flush); + min_repeat_ms, cooldown_interval, enable_cpu_cache_flush, device); }); } // namespace auto_scheduler diff --git a/tests/python/relay/test_auto_scheduler_tuning.py b/tests/python/relay/test_auto_scheduler_tuning.py index bbf3c48d5e3f..1431824899ec 100644 --- a/tests/python/relay/test_auto_scheduler_tuning.py +++ b/tests/python/relay/test_auto_scheduler_tuning.py @@ -36,7 +36,7 @@ def tune_network(network, target): log_file = fp.name # Tuning - measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60) + measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60, device=0) tuner = auto_scheduler.TaskScheduler(tasks, task_weights, callbacks=[]) tune_option = auto_scheduler.TuningOptions( num_measure_trials=100,