From e0363a01dd854d4edf595116e1a75f69b502819b Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Sat, 21 Nov 2020 02:37:52 +0000 Subject: [PATCH 1/5] [AutoScheduler] Task scheduler callbacks --- python/tvm/auto_scheduler/task_scheduler.py | 149 ++++++++++++++---- tutorials/auto_scheduler/tune_network_cuda.py | 7 +- 2 files changed, 123 insertions(+), 33 deletions(-) diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py index 884741bd08cc..d6583f59322a 100644 --- a/python/tvm/auto_scheduler/task_scheduler.py +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ -22,7 +22,7 @@ L. Zheng, C. Jia, M. Sun, Z. Wu, C. Yu, et al. "Ansor : Generating High-Performance Tensor Programs for Deep Learning." (OSDI 2020). """ - +import os import time import math import logging @@ -182,6 +182,7 @@ def __init__( beta: float = 2, gamma: float = 0.5, backward_window_size: int = 3, + callbacks=None, ): self.tasks = tasks if objective_func: # use custom objective function @@ -199,6 +200,7 @@ def __init__( self.beta = beta self.gamma = gamma self.backward_window_size = backward_window_size + self.callbacks = callbacks if callbacks is not None else [] assert len(self.tasks) != 0, "No tasks" assert self.strategy in ["round-robin", "gradient"] @@ -374,39 +376,12 @@ def tune(self, tune_option, search_policy="default"): ) break - def _print_table_info(self, next_task_idx): - # table header - _ffi_api.PrintTitle("Task Scheduler") - print("| ID | Latency (ms) | Speed (GFLOPS) | Trials |") - print("-------------------------------------------------") - - # content - for i in range(len(self.tasks)): - id_str = "%d" % i - latency_str = "%.3f" % (1e3 * self.best_costs[i]) if self.best_costs[i] < 1e9 else "-" - speed_str = ( - "%.2f" % (self.tasks[i].compute_dag.flop_ct / self.best_costs[i] / 1e9) - if self.best_costs[i] < 1e9 - else "-" - ) - trials_str = "%d" % (self.task_cts[i] * self.num_measures_per_round) - print("| %4s | %12s | % 14s | %6s |" % (id_str, 
latency_str, speed_str, trials_str)) - print("-------------------------------------------------") - - # overall info - if all(cost < 1e9 for cost in self.best_costs): - total_latency_str = "%.3f" % (self.cur_score * 1e3) - else: - total_latency_str = "-" - print( - "Estimated total latency: %s ms\tTrials: %d\tUsed time : %.0f s\tNext ID: %d\t" - % (total_latency_str, self.ct, time.time() - self.tic, next_task_idx) - ) - def _tune_task(self, task_idx): """Tune the select task for one round""" - if self.tune_option.verbose >= 1: - self._print_table_info(task_idx) + + # Run pre-tune callbacks + for callback in self.callbacks: + callback.pre_tune(self, task_idx) measure_inputs, measure_results = self.search_policies[task_idx].continue_search_one_round( self.num_measures_per_round, self.measurer @@ -426,6 +401,10 @@ def _tune_task(self, task_idx): self.ct += len(measure_inputs) self.cur_score = self._compute_score(self.best_costs) + # Run post-tune callbacks + for callback in self.callbacks: + callback.post_tune(self, task_idx) + def _compute_score(self, costs): """compute the objective function""" return self.objective_func(costs) @@ -478,3 +457,109 @@ def _restore_status(self, log_file, num_measures_per_round): self.cur_score = self._compute_score(self.best_costs) logger.info("TaskScheduler: Loaded %d measurement records from %s", total_ct + 1, log_file) + + +class TaskSchedulerCallback: + """The base class of task scheduler callback functions. """ + + def pre_tune(self, task_scheduler, task_id): + """The callback before tuning each task. + + Parameters + ---------- + task_scheduler: TaskScheduler + The task scheduler. + task_id: int + The task ID going to be tuned. + """ + pass + + def post_tune(self, task_scheduler, task_id): + """The callback after tuning each task. + + Parameters + ---------- + task_scheduler: TaskScheduler + The task scheduler. + task_id: int + The task ID be tuned. 
+ """ + pass + + +class PrintTableInfoCallback(TaskSchedulerCallback): + """The callback that prints a table of current progress.""" + + def pre_tune(self, task_scheduler, task_id): + if task_scheduler.tune_option.verbose < 1: + return + + _ffi_api.PrintTitle("Task Scheduler") + print("| ID | Latency (ms) | Speed (GFLOPS) | Trials |") + print("-------------------------------------------------") + + # content + for i in range(len(task_scheduler.tasks)): + id_str = "%d" % i + latency_str = ( + "%.3f" % (1e3 * task_scheduler.best_costs[i]) + if task_scheduler.best_costs[i] < 1e9 + else "-" + ) + speed_str = ( + "%.2f" + % (task_scheduler.tasks[i].compute_dag.flop_ct / task_scheduler.best_costs[i] / 1e9) + if task_scheduler.best_costs[i] < 1e9 + else "-" + ) + trials_str = "%d" % (task_scheduler.task_cts[i] * task_scheduler.num_measures_per_round) + print("| %4s | %12s | % 14s | %6s |" % (id_str, latency_str, speed_str, trials_str)) + print("-------------------------------------------------") + + # overall info + if all(cost < 1e9 for cost in task_scheduler.best_costs): + total_latency_str = "%.3f" % (task_scheduler.cur_score * 1e3) + else: + total_latency_str = "-" + print( + "Estimated total latency: %s ms\tTrials: %d\tUsed time : %.0f s\tNext ID: %d\t" + % ( + total_latency_str, + task_scheduler.ct, + time.time() - task_scheduler.tic, + task_id, + ) + ) + + +class LogEstimatedLatencyCallback(TaskSchedulerCallback): + """Log the estimated latency to the file after tuning a task. + + Parameters + ---------- + log_file: str + The log file path. 
+ """ + + def __init__(self, log_file): + if os.path.exists(log_file): # Remove existing log + os.remove(log_file) + + self.log_file = log_file + + def post_tune(self, task_scheduler, task_id): + if all(cost < 1e9 for cost in task_scheduler.best_costs): + total_latency_str = "%.3f" % (task_scheduler.cur_score * 1e3) + else: + total_latency_str = "N/A" + + with open(self.log_file, "a") as filep: + filep.write( + "ElapsedTime(s)\t%.0f\tEstimatedLatency(ms)\t%s\tTrials\t%d\n" + % ( + time.time() - task_scheduler.tic, + total_latency_str, + task_scheduler.ct, + ) + ) + filep.flush() diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index 723b8d15ea88..5b8fbc5f0f91 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -168,6 +168,9 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. # This can warmup the GPU, which is necessary to get accurate measurement results. # Typically, we recommend a value > 300 ms. +# * :code:`callbacks` could be a list of task scheduler callbacks during the tuning. You can use +# PrintTableInfoCallback to print a timely progress table. You can also use +# LogEstimatedLatencyCallback to log the estimated total latency to a file for better analysis. # * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. # You can set it to a small number (e.g., 200) for a fast demonstrative run. 
# In practice, we recommend setting it around :code:`1000 * len(tasks)`, @@ -186,7 +189,9 @@ def run_tuning(): print("Begin tuning...") measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400, timeout=10) - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tuner = auto_scheduler.TaskScheduler( + tasks, task_weights, callbacks=[auto_scheduler.task_scheduler.PrintTableInfoCallback()] + ) tune_option = auto_scheduler.TuningOptions( num_measure_trials=200, # change this to 20000 to achieve the best performance runner=measure_ctx.runner, From 6b340303a9a225e7842ded9a28c72566f5ea357f Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Sat, 21 Nov 2020 02:40:16 +0000 Subject: [PATCH 2/5] docstring --- python/tvm/auto_scheduler/task_scheduler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py index d6583f59322a..c133823b65ae 100644 --- a/python/tvm/auto_scheduler/task_scheduler.py +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ -168,6 +168,8 @@ class TaskScheduler: The parameter used for 'gradient' strategy backward_window_size: int = 3 The parameter used for 'gradient' strategy + callbacks: Optional[List[TaskSchedulerCallback]] + The task scheduler callbacks that will be called before and after tuning a task. 
""" def __init__( From 2880e6bb1df60e3b03554337e3a6b1f6cd14fc82 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Mon, 23 Nov 2020 18:47:41 +0000 Subject: [PATCH 3/5] address comments --- python/tvm/auto_scheduler/task_scheduler.py | 11 ++++++----- tutorials/auto_scheduler/tune_network_cuda.py | 11 +++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py index c133823b65ae..b6eeaf44abe4 100644 --- a/python/tvm/auto_scheduler/task_scheduler.py +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ -170,6 +170,7 @@ class TaskScheduler: The parameter used for 'gradient' strategy callbacks: Optional[List[TaskSchedulerCallback]] The task scheduler callbacks that will be called before and after tuning a task. + If None, then PrintTableInfo callback will be used. """ def __init__( @@ -202,7 +203,7 @@ def __init__( self.beta = beta self.gamma = gamma self.backward_window_size = backward_window_size - self.callbacks = callbacks if callbacks is not None else [] + self.callbacks = [PrintTableInfo()] if callbacks is not None else [] assert len(self.tasks) != 0, "No tasks" assert self.strategy in ["round-robin", "gradient"] @@ -474,7 +475,7 @@ def pre_tune(self, task_scheduler, task_id): task_id: int The task ID going to be tuned. """ - pass + # Do nothing by default def post_tune(self, task_scheduler, task_id): """The callback after tuning each task. @@ -486,10 +487,10 @@ def post_tune(self, task_scheduler, task_id): task_id: int The task ID be tuned. 
""" - pass + # Do nothing by default -class PrintTableInfoCallback(TaskSchedulerCallback): +class PrintTableInfo(TaskSchedulerCallback): """The callback that prints a table of current progress.""" def pre_tune(self, task_scheduler, task_id): @@ -534,7 +535,7 @@ def pre_tune(self, task_scheduler, task_id): ) -class LogEstimatedLatencyCallback(TaskSchedulerCallback): +class LogEstimatedLatency(TaskSchedulerCallback): """Log the estimated latency to the file after tuning a task. Parameters diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index 5b8fbc5f0f91..e13fa0a79dce 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -168,9 +168,10 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. # This can warmup the GPU, which is necessary to get accurate measurement results. # Typically, we recommend a value > 300 ms. -# * :code:`callbacks` could be a list of task scheduler callbacks during the tuning. You can use -# PrintTableInfoCallback to print a timely progress table. You can also use -# LogEstimatedLatencyCallback to log the estimated total latency to a file for better analysis. +# * :code:`callbacks` could be a list of task scheduler callbacks during the tuning. +# You can use callbacks such as LogEstimatedLatency to log the estimated total latency +# to a file for better analysis. When not specified, PrintTableInfo is used to print +# a timely progress table. # * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. # You can set it to a small number (e.g., 200) for a fast demonstrative run. 
# In practice, we recommend setting it around :code:`1000 * len(tasks)`, @@ -186,9 +190,7 @@ def run_tuning(): print("Begin tuning...") measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400, timeout=10) - tuner = auto_scheduler.TaskScheduler( - tasks, task_weights, callbacks=[auto_scheduler.task_scheduler.PrintTableInfoCallback()] - ) + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tune_option = auto_scheduler.TuningOptions( num_measure_trials=200, # change this to 20000 to achieve the best performance runner=measure_ctx.runner, From 65ab5da0809cecc859a0f5236de99f772d9afca7 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 23 Nov 2020 11:31:06 -0800 Subject: [PATCH 4/5] Delete the explanation of callback in the tutorial --- tutorials/auto_scheduler/tune_network_cuda.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index e13fa0a79dce..723b8d15ea88 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -168,10 +168,6 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. # This can warmup the GPU, which is necessary to get accurate measurement results. # Typically, we recommend a value > 300 ms. -# * :code:`callbacks` could be a list of task scheduler callbacks during the tuning. -# You can use callbacks such as LogEstimatedLatency to log the estimated total latency -# to a file for better analysis. When not specified, PrintTableInfo is used to print -# a timely progress table. # * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. # You can set it to a small number (e.g., 200) for a fast demonstrative run. 
# In practice, we recommend setting it around :code:`1000 * len(tasks)`, From 9e317ded83d758ac7dee694f69d8cf9d37322d9f Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Mon, 23 Nov 2020 19:41:38 +0000 Subject: [PATCH 5/5] fix --- python/tvm/auto_scheduler/task_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py index b6eeaf44abe4..de11fc1b5b11 100644 --- a/python/tvm/auto_scheduler/task_scheduler.py +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ -203,7 +203,7 @@ def __init__( self.beta = beta self.gamma = gamma self.backward_window_size = backward_window_size - self.callbacks = [PrintTableInfo()] if callbacks is not None else [] + self.callbacks = callbacks if callbacks is not None else [PrintTableInfo()] assert len(self.tasks) != 0, "No tasks" assert self.strategy in ["round-robin", "gradient"]