From e0363a01dd854d4edf595116e1a75f69b502819b Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Sat, 21 Nov 2020 02:37:52 +0000 Subject: [PATCH 1/5] [AutoScheduler] Task scheduler callbacks --- python/tvm/auto_scheduler/task_scheduler.py | 149 ++++++++++++++---- tutorials/auto_scheduler/tune_network_cuda.py | 7 +- 2 files changed, 123 insertions(+), 33 deletions(-) diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py index 884741bd08cc..d6583f59322a 100644 --- a/python/tvm/auto_scheduler/task_scheduler.py +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ -22,7 +22,7 @@ L. Zheng, C. Jia, M. Sun, Z. Wu, C. Yu, et al. "Ansor : Generating High-Performance Tensor Programs for Deep Learning." (OSDI 2020). """ - +import os import time import math import logging @@ -182,6 +182,7 @@ def __init__( beta: float = 2, gamma: float = 0.5, backward_window_size: int = 3, + callbacks=None, ): self.tasks = tasks if objective_func: # use custom objective function @@ -199,6 +200,7 @@ def __init__( self.beta = beta self.gamma = gamma self.backward_window_size = backward_window_size + self.callbacks = callbacks if callbacks is not None else [] assert len(self.tasks) != 0, "No tasks" assert self.strategy in ["round-robin", "gradient"] @@ -374,39 +376,12 @@ def tune(self, tune_option, search_policy="default"): ) break - def _print_table_info(self, next_task_idx): - # table header - _ffi_api.PrintTitle("Task Scheduler") - print("| ID | Latency (ms) | Speed (GFLOPS) | Trials |") - print("-------------------------------------------------") - - # content - for i in range(len(self.tasks)): - id_str = "%d" % i - latency_str = "%.3f" % (1e3 * self.best_costs[i]) if self.best_costs[i] < 1e9 else "-" - speed_str = ( - "%.2f" % (self.tasks[i].compute_dag.flop_ct / self.best_costs[i] / 1e9) - if self.best_costs[i] < 1e9 - else "-" - ) - trials_str = "%d" % (self.task_cts[i] * self.num_measures_per_round) - print("| %4s | %12s | % 14s | %6s |" % (id_str, 
latency_str, speed_str, trials_str)) - print("-------------------------------------------------") - - # overall info - if all(cost < 1e9 for cost in self.best_costs): - total_latency_str = "%.3f" % (self.cur_score * 1e3) - else: - total_latency_str = "-" - print( - "Estimated total latency: %s ms\tTrials: %d\tUsed time : %.0f s\tNext ID: %d\t" - % (total_latency_str, self.ct, time.time() - self.tic, next_task_idx) - ) - def _tune_task(self, task_idx): """Tune the select task for one round""" - if self.tune_option.verbose >= 1: - self._print_table_info(task_idx) + + # Run pre-tune callbacks + for callback in self.callbacks: + callback.pre_tune(self, task_idx) measure_inputs, measure_results = self.search_policies[task_idx].continue_search_one_round( self.num_measures_per_round, self.measurer @@ -426,6 +401,10 @@ def _tune_task(self, task_idx): self.ct += len(measure_inputs) self.cur_score = self._compute_score(self.best_costs) + # Run post-tune callbacks + for callback in self.callbacks: + callback.post_tune(self, task_idx) + def _compute_score(self, costs): """compute the objective function""" return self.objective_func(costs) @@ -478,3 +457,109 @@ def _restore_status(self, log_file, num_measures_per_round): self.cur_score = self._compute_score(self.best_costs) logger.info("TaskScheduler: Loaded %d measurement records from %s", total_ct + 1, log_file) + + +class TaskSchedulerCallback: + """The base class of task scheduler callback functions. """ + + def pre_tune(self, task_scheduler, task_id): + """The callback before tuning each task. + + Parameters + ---------- + task_scheduler: TaskScheduler + The task scheduler. + task_id: int + The task ID going to be tuned. + """ + pass + + def post_tune(self, task_scheduler, task_id): + """The callback after tuning each task. + + Parameters + ---------- + task_scheduler: TaskScheduler + The task scheduler. + task_id: int + The task ID be tuned. 
+ """ + pass + + +class PrintTableInfoCallback(TaskSchedulerCallback): + """The callback that prints a table of current progress.""" + + def pre_tune(self, task_scheduler, task_id): + if task_scheduler.tune_option.verbose < 1: + return + + _ffi_api.PrintTitle("Task Scheduler") + print("| ID | Latency (ms) | Speed (GFLOPS) | Trials |") + print("-------------------------------------------------") + + # content + for i in range(len(task_scheduler.tasks)): + id_str = "%d" % i + latency_str = ( + "%.3f" % (1e3 * task_scheduler.best_costs[i]) + if task_scheduler.best_costs[i] < 1e9 + else "-" + ) + speed_str = ( + "%.2f" + % (task_scheduler.tasks[i].compute_dag.flop_ct / task_scheduler.best_costs[i] / 1e9) + if task_scheduler.best_costs[i] < 1e9 + else "-" + ) + trials_str = "%d" % (task_scheduler.task_cts[i] * task_scheduler.num_measures_per_round) + print("| %4s | %12s | % 14s | %6s |" % (id_str, latency_str, speed_str, trials_str)) + print("-------------------------------------------------") + + # overall info + if all(cost < 1e9 for cost in task_scheduler.best_costs): + total_latency_str = "%.3f" % (task_scheduler.cur_score * 1e3) + else: + total_latency_str = "-" + print( + "Estimated total latency: %s ms\tTrials: %d\tUsed time : %.0f s\tNext ID: %d\t" + % ( + total_latency_str, + task_scheduler.ct, + time.time() - task_scheduler.tic, + task_id, + ) + ) + + +class LogEstimatedLatencyCallback(TaskSchedulerCallback): + """Log the estimated latency to the file after tuning a task. + + Parameters + ---------- + log_file: str + The log file path. 
+ """ + + def __init__(self, log_file): + if os.path.exists(log_file): # Remove existing log + os.remove(log_file) + + self.log_file = log_file + + def post_tune(self, task_scheduler, task_id): + if all(cost < 1e9 for cost in task_scheduler.best_costs): + total_latency_str = "%.3f" % (task_scheduler.cur_score * 1e3) + else: + total_latency_str = "N/A" + + with open(self.log_file, "a") as filep: + filep.write( + "ElapsedTime(s)\t%.0f\tEstimatedLatency(ms)\t%s\tTrials\t%d\n" + % ( + time.time() - task_scheduler.tic, + total_latency_str, + task_scheduler.ct, + ) + ) + filep.flush() diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index 723b8d15ea88..5b8fbc5f0f91 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -168,6 +168,9 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. # This can warmup the GPU, which is necessary to get accurate measurement results. # Typically, we recommend a value > 300 ms. +# * :code:`callbacks` could be a list of task scheduler callbacks during the tuning. You can use +# PrintTableInfoCallback to print a timely progress table. You can also use +# LogEstimatedLatencyCallback to log the estimated total latency to a file for better analysis. # * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. # You can set it to a small number (e.g., 200) for a fast demonstrative run. 
# In practice, we recommend setting it around :code:`1000 * len(tasks)`, @@ -186,7 +189,9 @@ def run_tuning(): print("Begin tuning...") measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400, timeout=10) - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tuner = auto_scheduler.TaskScheduler( + tasks, task_weights, callbacks=[auto_scheduler.task_scheduler.PrintTableInfoCallback()] + ) tune_option = auto_scheduler.TuningOptions( num_measure_trials=200, # change this to 20000 to achieve the best performance runner=measure_ctx.runner, From 6b340303a9a225e7842ded9a28c72566f5ea357f Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Sat, 21 Nov 2020 02:40:16 +0000 Subject: [PATCH 2/5] docstring --- python/tvm/auto_scheduler/task_scheduler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py index d6583f59322a..c133823b65ae 100644 --- a/python/tvm/auto_scheduler/task_scheduler.py +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ -168,6 +168,8 @@ class TaskScheduler: The parameter used for 'gradient' strategy backward_window_size: int = 3 The parameter used for 'gradient' strategy + callbacks: Optional[List[TaskSchedulerCallback]] + The task scheduler callbacks that will be called before and after tuning a task. 
""" def __init__( From 2880e6bb1df60e3b03554337e3a6b1f6cd14fc82 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Mon, 23 Nov 2020 18:47:41 +0000 Subject: [PATCH 3/5] address comments --- python/tvm/auto_scheduler/task_scheduler.py | 11 ++++++----- tutorials/auto_scheduler/tune_network_cuda.py | 11 +++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py index c133823b65ae..b6eeaf44abe4 100644 --- a/python/tvm/auto_scheduler/task_scheduler.py +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ -170,6 +170,7 @@ class TaskScheduler: The parameter used for 'gradient' strategy callbacks: Optional[List[TaskSchedulerCallback]] The task scheduler callbacks that will be called before and after tuning a task. + If None, then PrintTableInfo callback will be used. """ def __init__( @@ -202,7 +203,7 @@ def __init__( self.beta = beta self.gamma = gamma self.backward_window_size = backward_window_size - self.callbacks = callbacks if callbacks is not None else [] + self.callbacks = [PrintTableInfo()] if callbacks is not None else [] assert len(self.tasks) != 0, "No tasks" assert self.strategy in ["round-robin", "gradient"] @@ -474,7 +475,7 @@ def pre_tune(self, task_scheduler, task_id): task_id: int The task ID going to be tuned. """ - pass + # Do nothing by default def post_tune(self, task_scheduler, task_id): """The callback after tuning each task. @@ -486,10 +487,10 @@ def post_tune(self, task_scheduler, task_id): task_id: int The task ID be tuned. 
""" - pass + # Do nothing by default -class PrintTableInfoCallback(TaskSchedulerCallback): +class PrintTableInfo(TaskSchedulerCallback): """The callback that prints a table of current progress.""" def pre_tune(self, task_scheduler, task_id): @@ -534,7 +535,7 @@ def pre_tune(self, task_scheduler, task_id): ) -class LogEstimatedLatencyCallback(TaskSchedulerCallback): +class LogEstimatedLatency(TaskSchedulerCallback): """Log the estimated latency to the file after tuning a task. Parameters diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index 5b8fbc5f0f91..e13fa0a79dce 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -168,9 +168,10 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. # This can warmup the GPU, which is necessary to get accurate measurement results. # Typically, we recommend a value > 300 ms. -# * :code:`callbacks` could be a list of task scheduler callbacks during the tuning. You can use -# PrintTableInfoCallback to print a timely progress table. You can also use -# LogEstimatedLatencyCallback to log the estimated total latency to a file for better analysis. +# * :code:`callbacks` could be a list of task scheduler callbacks during the tuning. +# You can use callbacks such as LogEstimatedLatency to log the estimated total latency +# to a file for better analysis. When not specified, PrintTableInfo is used to print +# a timely progress table. # * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. # You can set it to a small number (e.g., 200) for a fast demonstrative run. 
# In practice, we recommend setting it around :code:`1000 * len(tasks)`, @@ -186,9 +190,7 @@ def run_tuning(): print("Begin tuning...") measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400, timeout=10) - tuner = auto_scheduler.TaskScheduler( - tasks, task_weights, callbacks=[auto_scheduler.task_scheduler.PrintTableInfoCallback()] - ) + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tune_option = auto_scheduler.TuningOptions( num_measure_trials=200, # change this to 20000 to achieve the best performance runner=measure_ctx.runner, From 65ab5da0809cecc859a0f5236de99f772d9afca7 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 23 Nov 2020 11:31:06 -0800 Subject: [PATCH 4/5] Delete the explanation of callback in the tutorial --- tutorials/auto_scheduler/tune_network_cuda.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index e13fa0a79dce..723b8d15ea88 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -168,10 +168,6 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. # This can warmup the GPU, which is necessary to get accurate measurement results. # Typically, we recommend a value > 300 ms. -# * :code:`callbacks` could be a list of task scheduler callbacks during the tuning. -# You can use callbacks such as LogEstimatedLatency to log the estimated total latency -# to a file for better analysis. When not specified, PrintTableInfo is used to print -# a timely progress table. # * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. # You can set it to a small number (e.g., 200) for a fast demonstrative run. 
# In practice, we recommend setting it around :code:`1000 * len(tasks)`, From 9e317ded83d758ac7dee694f69d8cf9d37322d9f Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Mon, 23 Nov 2020 19:41:38 +0000 Subject: [PATCH 5/5] fix --- python/tvm/auto_scheduler/task_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py index b6eeaf44abe4..de11fc1b5b11 100644 --- a/python/tvm/auto_scheduler/task_scheduler.py +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ -203,7 +203,7 @@ def __init__( self.beta = beta self.gamma = gamma self.backward_window_size = backward_window_size - self.callbacks = [PrintTableInfo()] if callbacks is not None else [] + self.callbacks = callbacks if callbacks is not None else [PrintTableInfo()] assert len(self.tasks) != 0, "No tasks" assert self.strategy in ["round-robin", "gradient"]