tlc-pack · junrushao · Dec 8, 2021 · Dec 7, 2021 · Dec 7, 2021 · Dec 7, 2021
@@ -543,7 +543,8 @@ def print_best(self, log_file, print_mode="schedule"):
         code: str
             The best schedule code in python API or CUDA source code
         """
-        inp, _ = load_best_record(log_file, self.workload_key)
+        inp, res = load_best_record(log_file, self.workload_key)
+        print("Best codes (ms):", [float(c) * 1000.0 for c in res.costs])
         if inp is None:
             raise RuntimeError(
                 "Cannot find any valid schedule for %s in file %s" % (self.workload_key, log_file)

@@ -194,7 +194,10 @@ def workload_key_to_tensors(workload_key):
     assert callable(value)
 
     args = deserialize_args(workload[1:])
-    return value(*args)
+    result = value(*args)
+    if isinstance(result, tuple):
+        result = list(result)
+    return result
 
 
 def serialize_workload_registry_entry(workload_key):

@@ -201,7 +201,7 @@ def default_llvm(
                 postproc.RewriteParallelVectorizeUnroll(),
                 postproc.RewriteReductionBlock(),
             ],
-            mutators=[],
+            mutator_probs=None,
             task_name=task_name,
             rand_state=-1,
             num_threads=None,
@@ -269,7 +269,7 @@ def default_cuda(
                 postproc.RewriteReductionBlock(),
                 postproc.VerifyGPUCode(),
             ],
-            mutators=[],
+            mutator_probs=None,
             task_name=task_name,
             rand_state=-1,
             num_threads=None,

@@ -108,8 +108,16 @@ void TaskSchedulerNode::Tune() {
     CHECK(task->search_strategy.defined())
         << "ValueError: Require `context.search_strategy`, but it is not defined";
     InitializeTask(i);
-    task->search_strategy.value()->PreTuning(
-        task->space_generator.value()->GenerateDesignSpace(task->mod.value()));
+    Array<tir::Schedule> design_spaces =
+        task->space_generator.value()->GenerateDesignSpace(task->mod.value());
+    LOG(INFO) << "Total " << design_spaces.size() << " design space(s) generated";
+    for (int i = 0, n = design_spaces.size(); i < n; ++i) {
+      const tir::Schedule& sch = design_spaces[i];
+      LOG(INFO) << "Design space #" << i << ":\n"  //
+                << tir::AsTVMScript(sch->mod()) << "\n"
+                << Concat(sch->trace().value()->AsPython(false), "\n");
+    }
+    task->search_strategy.value()->PreTuning(design_spaces);
   }
 
   int running_tasks = tasks.size();

@@ -70,6 +70,20 @@ Target TargetTag::AddTag(String name, Map<String, ObjectRef> config, bool overri
 
 /**********  Register Target tags  **********/
 
+TVM_REGISTER_TARGET_TAG("raspberry-pi/4b-64")  // LLVM target tag: aarch64 Linux triple, cortex-a72 CPU, +neon, 4 cores
+    .set_config({{"kind", String("llvm")},
+                 {"mtriple", String("aarch64-linux-gnu")},
+                 {"mcpu", String("cortex-a72")},
+                 {"mattr", Array<String>{"+neon"}},
+                 {"num-cores", Integer(4)},
+                 {"host", Map<String, ObjectRef>{  // host entry repeats exactly the same settings as the device
+                              {"kind", String("llvm")},
+                              {"mtriple", String("aarch64-linux-gnu")},
+                              {"mcpu", String("cortex-a72")},
+                              {"mattr", Array<String>{"+neon"}},
+                              {"num-cores", Integer(4)},
+                          }}});
+
 #define TVM_REGISTER_CUDA_TAG(Name, Arch, SharedMem, RegPerBlock) \
   TVM_REGISTER_TARGET_TAG(Name).set_config({                      \
       {"kind", String("cuda")},                                   \

@@ -78,9 +78,10 @@ class ScopeReplacer : public StmtMutator {
   bool found_;
 };
 
-class BufferReplacer : public StmtExprMutator {
+class ReadWriteAtBufferReplacer : public StmtExprMutator {
  public:
-  explicit BufferReplacer(const Buffer& src, const Buffer& dst, Map<Block, Block>* block_sref_reuse)
+  explicit ReadWriteAtBufferReplacer(const Buffer& src, const Buffer& dst,
+                                     Map<Block, Block>* block_sref_reuse)
       : src_(src), dst_(dst), block_sref_reuse_(block_sref_reuse) {}
 
  private:
@@ -245,7 +246,7 @@ struct ReadWriteAtImpl {
       domain.push_back(Range::FromMinExtent(min, extent));
     }
     // Step 4. Insert the auto copy block and replace buffers
-    BufferReplacer replacer(src_, dst_, &block_sref_reuse_);
+    ReadWriteAtBufferReplacer replacer(src_, dst_, &block_sref_reuse_);
     for (int i = st; i < ed; ++i) {
       Stmt stmt = subtrees[i];
       subtrees.Set(i, Stmt(nullptr));

@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Runs tests/python/meta_schedule/test_ansor_cpu.py once per workload listed at
+# the bottom of this file, and mirrors each run's output into "$LOG_DIR".
+# bash is required: `pipefail` and `local` are not available in POSIX sh.
+set -euxo pipefail
+
+RPC_HOST="192.168.6.66"
+RPC_PORT="4445"
+RPC_KEY="raspi4b-aarch64"
+TARGET="raspberry-pi/4b-64"
+NUM_TRIALS=800
+LOG_DIR="$HOME/logs/ansor-cpu/"
+
+# Quoted so that a $HOME containing spaces does not split into several paths.
+mkdir -p "$LOG_DIR"
+
+# run NAME
+#   Tune the workload called NAME and tee its combined stdout/stderr into
+#   "$LOG_DIR/NAME.log". With `set -e -o pipefail`, a failing python run
+#   aborts the whole script.
+run () {
+    local name="$1"
+    echo "Running workload $name"
+    python tests/python/meta_schedule/test_ansor_cpu.py \
+        --workload "$name"                  \
+        --target "$TARGET"                  \
+        --rpc-host "$RPC_HOST"              \
+        --rpc-port "$RPC_PORT"              \
+        --rpc-key "$RPC_KEY"                \
+        --num-trials "$NUM_TRIALS"          \
+        --log-dir "$LOG_DIR"                \
+        2>&1 | tee "$LOG_DIR/$name.log"
+}
+
+# Single op
+run C1D
+run C2D
+run C3D
+run CAP
+run DEP
+run DIL
+run GMM
+run GRP
+run NRM
+run T2D
+# Subgraph
+run C2d-BN-RELU
+run TBG
+
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Runs tests/python/meta_schedule/test_tune_te_cpu.py once per workload listed
+# at the bottom of this file, and mirrors each run's output into "$LOG_DIR".
+# bash is required: `pipefail` and `local` are not available in POSIX sh.
+set -euxo pipefail
+
+RPC_HOST="192.168.6.66"
+RPC_PORT="4445"
+RPC_KEY="raspi4b-aarch64"
+TARGET="raspberry-pi/4b-64"
+# Named so the trial budget is configured in one place next to the other settings.
+NUM_TRIALS=1500
+LOG_DIR="$HOME/logs/ms-cpu/"
+
+# Quoted so that a $HOME containing spaces does not split into several paths.
+mkdir -p "$LOG_DIR"
+
+# run NAME
+#   Tune the workload called NAME and tee its combined stdout/stderr into
+#   "$LOG_DIR/NAME.log". With `set -e -o pipefail`, a failing python run
+#   aborts the whole script.
+run () {
+    local name="$1"
+    echo "Running workload $name"
+    python tests/python/meta_schedule/test_tune_te_cpu.py \
+        --workload "$name"                  \
+        --target "$TARGET"                  \
+        --rpc-host "$RPC_HOST"              \
+        --rpc-port "$RPC_PORT"              \
+        --rpc-key "$RPC_KEY"                \
+        --num-trials "$NUM_TRIALS"          \
+        2>&1 | tee "$LOG_DIR/$name.log"
+}
+
+# Single op
+run C1D
+run C2D
+# run C3D
+run CAP
+run DEP
+run DIL
+run GMM
+run GRP
+# run NRM
+run T2D
+# Subgraph
+run C2d-BN-RELU
+run TBG
+
@@ -0,0 +1,119 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-docstring
+import argparse
+import os
+
+import tvm
+from tvm import auto_scheduler
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.testing.te_workload import CONFIGS
+
+
+def _parse_args():
+    args = argparse.ArgumentParser()
+    args.add_argument(
+        "--workload",
+        type=str,
+        required=True,
+    )
+    args.add_argument(
+        "--target",
+        type=str,
+        required=True,
+    )
+    args.add_argument(
+        "--num-trials",
+        type=int,
+        required=True,
+    )
+    args.add_argument(
+        "--rpc-host",
+        type=str,
+        required=True,
+    )
+    args.add_argument(
+        "--rpc-port",
+        type=int,
+        required=True,
+    )
+    args.add_argument(
+        "--rpc-key",
+        type=str,
+        required=True,
+    )
+    args.add_argument(
+        "--log-dir",
+        type=str,
+        required=True,
+    )
+    parsed = args.parse_args()
+    parsed.target = tvm.target.Target(parsed.target)
+    rpc_config = ms.runner.RPCConfig(
+        tracker_host=parsed.rpc_host,
+        tracker_port=parsed.rpc_port,
+        tracker_key=parsed.rpc_key,
+        session_timeout_sec=60,
+    )
+    parsed.rpc_workers = rpc_config.count_num_servers(allow_missing=False)
+    return parsed
+
+
+ARGS = _parse_args()  # parsed once when the module is loaded; main() reads this global
+
+
+def main():
+    log_file = os.path.join(ARGS.log_dir, f"{ARGS.workload}.json")
+    workload_func, params = CONFIGS[ARGS.workload]
+    params = params[0]
+    workload_func = auto_scheduler.register_workload(workload_func)
+    task = auto_scheduler.SearchTask(
+        func=workload_func,
+        args=params,
+        target=ARGS.target,
+        hardware_params=auto_scheduler.HardwareParams(
+            num_cores=int(ARGS.target.attrs["num-cores"]),
+            target=ARGS.target,
+        ),
+    )
+    runner = auto_scheduler.RPCRunner(
+        key=ARGS.rpc_key,
+        host=ARGS.rpc_host,
+        port=ARGS.rpc_port,
+        n_parallel=ARGS.rpc_workers,
+    )
+
+    # Inspect the computational graph
+    print("Computational DAG:")
+    print(task.compute_dag)
+    tune_option = auto_scheduler.TuningOptions(
+        num_measure_trials=ARGS.num_trials,
+        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
+        verbose=2,
+        runner=runner,
+    )
+    print("Running AutoTuning:")
+    task.tune(tune_option)
+    print("History Best:")
+    print(task.print_best(log_file))
+    sch, args = task.apply_best(log_file)
+    print("Lowered TIR:")
+    print(tvm.lower(sch, args, simple_mode=True))
+
+
+if __name__ == "__main__":  # call main() only when executed as a script
+    main()